/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
           Copyright (c) 1997-2021 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/


/* This module contains the external function pcre_compile(), along with
supporting internal functions that are not used by other modules. */


#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* These three names are defined ahead of pcre_internal.h.
NOTE(review): presumably macros in that header expand them to reach the
newline data block and the pattern bounds - confirm against the header. */

#define NLBLOCK cd             /* Block containing newline information */
#define PSSTART start_pattern  /* Field containing pattern start */
#define PSEND   end_pattern    /* Field containing pattern end */

#include "pcre_internal.h"


/* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
is also used by pcretest. PCRE_DEBUG is not defined when building a production
library. We do not need to select pcre16_printint.c specially, because the
COMPILE_PCREx macro will already be appropriately set. */

#ifdef PCRE_DEBUG
/* pcre_printint.c should not include any headers; PCRE_INCLUDED is defined
only for the duration of the textual inclusion below. */
#define PCRE_INCLUDED
#include "pcre_printint.c"
#undef PCRE_INCLUDED
#endif


/* Macro for setting individual bits in class bitmaps.

  a   the bitmap: any expression yielding an array of, or pointer to, bytes
  b   the zero-based bit number; byte b/8 receives bit b%8

Both arguments and the whole expansion are parenthesised so that the macro
can be given pointer expressions such as "map + 4" and can be used wherever an
expression is allowed. Note that b is evaluated twice, so it must not have
side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1U << ((b)&7)))

/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. */

#define OFLOW_MAX (INT_MAX - 20)

/* Definitions to allow mutual recursion */
|
|
|
|
static int
|
|
add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
|
|
const pcre_uint32 *, unsigned int);
|
|
|
|
static BOOL
|
|
compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
|
|
pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
|
|
compile_data *, int *);
|
|
|
|
|
|
|
|
/*************************************************
*      Code parameters and static tables         *
*************************************************/

/* This value specifies the size of stack workspace that is used during the
first pre-compile phase that determines how much memory is required. The regex
is partly compiled into this space, but the compiled parts are discarded as
soon as they can be, so that hopefully there will never be an overrun. The code
does, however, check for an overrun. The largest amount I've seen used is 218,
so this number is very generous.

The same workspace is used during the second, actual compile phase for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room for most patterns. However, the memory can get
filled up by repetitions of forward references, for example patterns like
/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
that the workspace is expanded using malloc() in this situation. The value
below is therefore a minimum, and we put a maximum on it for safety. The
minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
kicks in at the same number of forward references in all cases. */

#define COMPILE_WORK_SIZE (2048*LINK_SIZE)
#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)

/* This value determines the size of the initial vector that is used for
remembering named groups during the pre-compile. It is allocated on the stack,
but if it is too small, it is expanded using malloc(), in a similar way to the
workspace. The value is the number of slots in the list. */

#define NAMED_GROUP_LIST_SIZE  20

/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. */

#define WORK_SIZE_SAFETY_MARGIN (100)

/* Private flags added to firstchar and reqchar. The two positive flags are
distinct single bits, so they can be combined with bitwise OR. */

#define REQ_CASELESS  (1U << 0)  /* Indicates caselessness */
#define REQ_VARY      (1U << 1)  /* Reqchar followed non-literal item */

/* Negative values for the firstchar and reqchar flags */

#define REQ_UNSET     (-2)
#define REQ_NONE      (-1)

/* Repeated character flags. The constant keeps its "long" type; the suffix is
written in upper case because a lower-case "l" is easily misread as the
digit 1. */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */

/* Table for handling escaped characters in the range '0'-'z'. Positive returns
|
|
are simple data values; negative values are for special things like \d and so
|
|
on. Zero means further processing is needed (for things like \x), or the escape
|
|
is invalid. */
|
|
|
|
#ifndef EBCDIC
|
|
|
|
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
|
|
in UTF-8 mode. */
|
|
|
|
static const short int escapes[] = {
|
|
0, 0,
|
|
0, 0,
|
|
0, 0,
|
|
0, 0,
|
|
0, 0,
|
|
CHAR_COLON, CHAR_SEMICOLON,
|
|
CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
|
|
CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
|
|
CHAR_COMMERCIAL_AT, -ESC_A,
|
|
-ESC_B, -ESC_C,
|
|
-ESC_D, -ESC_E,
|
|
0, -ESC_G,
|
|
-ESC_H, 0,
|
|
0, -ESC_K,
|
|
0, 0,
|
|
-ESC_N, 0,
|
|
-ESC_P, -ESC_Q,
|
|
-ESC_R, -ESC_S,
|
|
0, 0,
|
|
-ESC_V, -ESC_W,
|
|
-ESC_X, 0,
|
|
-ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
|
|
CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
|
|
CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
|
|
CHAR_GRAVE_ACCENT, ESC_a,
|
|
-ESC_b, 0,
|
|
-ESC_d, ESC_e,
|
|
ESC_f, 0,
|
|
-ESC_h, 0,
|
|
0, -ESC_k,
|
|
0, 0,
|
|
ESC_n, 0,
|
|
-ESC_p, 0,
|
|
ESC_r, -ESC_s,
|
|
ESC_tee, 0,
|
|
-ESC_v, -ESC_w,
|
|
0, 0,
|
|
-ESC_z
|
|
};
|
|
|
|
#else
|
|
|
|
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
|
|
|
|
static const short int escapes[] = {
|
|
/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
|
|
/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
|
|
/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
|
|
/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
|
|
/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
|
|
/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
|
|
/* 80 */ 0, ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
|
|
/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
|
|
/* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p,
|
|
/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
|
|
/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
|
|
/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
|
|
/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
|
|
/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
|
|
/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
|
|
/* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
|
|
/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
|
|
/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
|
|
/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
|
|
/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
|
|
/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
|
|
};
|
|
|
|
/* We also need a table of characters that may follow \c in an EBCDIC
|
|
environment for characters 0-31. */
|
|
|
|
static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
|
|
|
|
#endif
|
|
|
|
|
|
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
searched linearly. Put all the names into a single string, in order to reduce
the number of relocations when a shared library is dynamically linked. The
string is built from string macros so that it works in UTF-8 mode on EBCDIC
platforms.

Each verb is described by one verbitem; the name itself is not stored in the
item, only its length. */

typedef struct verbitem {
  int   len;                 /* Length of verb name */
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
} verbitem;

static const char verbnames[] =
|
|
"\0" /* Empty name is a shorthand for MARK */
|
|
STRING_MARK0
|
|
STRING_ACCEPT0
|
|
STRING_COMMIT0
|
|
STRING_F0
|
|
STRING_FAIL0
|
|
STRING_PRUNE0
|
|
STRING_SKIP0
|
|
STRING_THEN;
|
|
|
|
static const verbitem verbs[] = {
|
|
{ 0, -1, OP_MARK },
|
|
{ 4, -1, OP_MARK },
|
|
{ 6, OP_ACCEPT, -1 },
|
|
{ 6, OP_COMMIT, -1 },
|
|
{ 1, OP_FAIL, -1 },
|
|
{ 4, OP_FAIL, -1 },
|
|
{ 5, OP_PRUNE, OP_PRUNE_ARG },
|
|
{ 4, OP_SKIP, OP_SKIP_ARG },
|
|
{ 4, OP_THEN, OP_THEN_ARG }
|
|
};
|
|
|
|
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
|
|
|
|
|
|
/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
|
|
another regex library. */
|
|
|
|
static const pcre_uchar sub_start_of_word[] = {
|
|
CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
|
|
CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
|
|
|
|
static const pcre_uchar sub_end_of_word[] = {
|
|
CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
|
|
CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
|
|
CHAR_RIGHT_PARENTHESIS, '\0' };
|
|
|
|
|
|
/* Tables of names of POSIX character classes and their lengths. The names are
|
|
now all in a single string, to reduce the number of relocations when a shared
|
|
library is dynamically loaded. The list of lengths is terminated by a zero
|
|
length entry. The first three must be alpha, lower, upper, as this is assumed
|
|
for handling case independence. The indices for graph, print, and punct are
|
|
needed, so identify them. */
|
|
|
|
static const char posix_names[] =
|
|
STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
|
|
STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
|
|
STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
|
|
STRING_word0 STRING_xdigit;
|
|
|
|
/* Lengths of the names in posix_names, in the same order: twelve five-letter
names (alpha ... space), then "word" (4) and "xdigit" (6). The final zero
terminates the list. */

static const pcre_uint8 posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };

/* Zero-based positions of graph, print and punct within posix_names. */

#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10


/* Table of class bit maps for each POSIX class. Each class is formed from a
|
|
base map, with an optional addition or removal of another map. Then, for some
|
|
classes, there is some additional tweaking: for [:blank:] the vertical space
|
|
characters are removed, and for [:alpha:] and [:alnum:] the underscore
|
|
character is removed. The triples in the table consist of the base map offset,
|
|
second map offset or -1 if no second map, and a non-negative value for map
|
|
addition or a negative value for map subtraction (if there are two maps). The
|
|
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
|
|
remove vertical space characters, 2 => remove underscore. */
|
|
|
|
static const int posix_class_maps[] = {
|
|
cbit_word, cbit_digit, -2, /* alpha */
|
|
cbit_lower, -1, 0, /* lower */
|
|
cbit_upper, -1, 0, /* upper */
|
|
cbit_word, -1, 2, /* alnum - word without underscore */
|
|
cbit_print, cbit_cntrl, 0, /* ascii */
|
|
cbit_space, -1, 1, /* blank - a GNU extension */
|
|
cbit_cntrl, -1, 0, /* cntrl */
|
|
cbit_digit, -1, 0, /* digit */
|
|
cbit_graph, -1, 0, /* graph */
|
|
cbit_print, -1, 0, /* print */
|
|
cbit_punct, -1, 0, /* punct */
|
|
cbit_space, -1, 0, /* space */
|
|
cbit_word, -1, 0, /* word - a Perl extension */
|
|
cbit_xdigit,-1, 0 /* xdigit */
|
|
};
|
|
|
|
/* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
Unicode property escapes. Every string below is a NUL-terminated pattern
fragment built from character macros. */

#ifdef SUPPORT_UCP
static const pcre_uchar string_PNd[]  = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pNd[]  = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXsp[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXsp[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXwd[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXwd[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };

static const pcre_uchar *substitutes[] = {
  string_PNd,           /* \D */
  string_pNd,           /* \d */
  string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
  string_pXsp,          /* \s */   /* space and POSIX space are the same. */
  string_PXwd,          /* \W */
  string_pXwd           /* \w */
};

/* The POSIX class substitutes must be in the order of the POSIX class names,
defined above, and there are both positive and negative cases. NULL means no
general substitute of a Unicode property escape (\p or \P). However, for some
POSIX classes (e.g. graph, print, punct) a special property code is compiled
directly. */

static const pcre_uchar string_pL[] =   {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLl[] =  {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLu[] =  {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXan[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_h[] =    {
  CHAR_BACKSLASH, CHAR_h, '\0' };
static const pcre_uchar string_pXps[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PL[] =   {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLl[] =  {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLu[] =  {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXan[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_H[] =    {
  CHAR_BACKSLASH, CHAR_H, '\0' };
static const pcre_uchar string_PXps[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };

static const pcre_uchar *posix_substitutes[] = {
  string_pL,            /* alpha */
  string_pLl,           /* lower */
  string_pLu,           /* upper */
  string_pXan,          /* alnum */
  NULL,                 /* ascii */
  string_h,             /* blank */
  NULL,                 /* cntrl */
  string_pNd,           /* digit */
  NULL,                 /* graph */
  NULL,                 /* print */
  NULL,                 /* punct */
  string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
  string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
  NULL,                 /* xdigit */
  /* Negated cases */
  string_PL,            /* ^alpha */
  string_PLl,           /* ^lower */
  string_PLu,           /* ^upper */
  string_PXan,          /* ^alnum */
  NULL,                 /* ^ascii */
  string_H,             /* ^blank */
  NULL,                 /* ^cntrl */
  string_PNd,           /* ^digit */
  NULL,                 /* ^graph */
  NULL,                 /* ^print */
  NULL,                 /* ^punct */
  string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
  string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
  NULL                  /* ^xdigit */
};
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
#endif

/* Stringification helpers: STRING() turns its argument into a string literal
as written; XSTRING() macro-expands the argument first, so that the value of a
macro such as MAX_NAME_SIZE can be embedded in a message. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used. This used to be a table of strings, but in order to reduce
the number of relocations needed when a shared library is loaded dynamically,
it is now one long string. We cannot use a table of offsets, because the
lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
simply count through to the one we want - this isn't a performance issue
because these strings are used only when there is a compilation error.

Each substring ends with \0 to insert a null character. This includes the final
substring, so that the whole string ends with \0\0, which can be detected when
counting through. */

static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "internal error: invalid forward reference offset\0"
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?( or (?(?C)\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is compiled without UTF support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{} or \\o{} is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */
  "(*VERB) not recognized or malformed\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
  "this version of PCRE is not compiled with Unicode property support\0"
#ifndef EBCDIC
  "\\c must be followed by an ASCII character\0"
#else
  "\\c must be followed by a letter or one of [\\]^_?\0"
#endif
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
  "internal error: unknown opcode in find_fixedlength()\0"
  "\\N is not supported in a class\0"
  "too many forward references\0"
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  "invalid UTF-16 string\0"
  /* 75 */
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  "character value in \\u.... sequence is too large\0"
  "invalid UTF-32 string\0"
  "setting UTF is disabled by the application\0"
  "non-hex character in \\x{} (closing brace missing?)\0"
  /* 80 */
  "non-octal character in \\o{} (closing brace missing?)\0"
  "missing opening brace after \\o\0"
  "parentheses are too deeply nested\0"
  "invalid range in character class\0"
  "group name must start with a non-digit\0"
  /* 85 */
  "parentheses are too deeply nested (stack check)\0"
  "digits missing in \\x{} or \\o{}\0"
  "regular expression is too complicated\0"
  ;

/* Table to identify digits and hex digits. This is used when compiling
|
|
patterns. Note that the tables in chartables are dependent on the locale, and
|
|
may mark arbitrary characters as digits - but the PCRE compiling code expects
|
|
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
|
|
a private table here. It costs 256 bytes, but it is a lot faster than doing
|
|
character value tests (at least in some simple cases I timed), and in some
|
|
applications one wants PCRE to compile efficiently as well as match
|
|
efficiently.
|
|
|
|
For convenience, we use the same bit definitions as in chartables:
|
|
|
|
0x04 decimal digit
|
|
0x08 hexadecimal digit
|
|
|
|
Then we can use ctype_digit and ctype_xdigit in the code. */
|
|
|
|
/* Using a simple comparison for decimal numbers rather than a memory read
|
|
is much faster, and the resulting code is simpler (the compiler turns it
|
|
into a subtraction and unsigned comparison). */
|
|
|
|
#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
|
|
|
|
#ifndef EBCDIC
|
|
|
|
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
|
|
UTF-8 mode. */
|
|
|
|
static const pcre_uint8 digitab[] =
|
|
{
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
|
|
0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
|
|
0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
|
|
0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
|
|
0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
|
|
|
|
#else
|
|
|
|
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
|
|
|
|
static const pcre_uint8 digitab[] =
|
|
{
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
|
|
0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
|
|
0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
|
|
0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
|
|
0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
|
|
|
|
static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
|
|
0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
|
|
0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
|
|
0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
|
|
0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
|
|
0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
|
|
0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
|
|
0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
|
|
0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
|
|
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
|
|
0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
|
|
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
|
|
0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
|
|
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
|
|
0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
|
|
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
|
|
0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
|
|
0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
|
|
0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
|
|
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
|
|
0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
|
|
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
|
|
0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
|
|
0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
|
|
0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
|
|
0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
|
|
#endif
|
|
|
|
|
|
/* This table is used to check whether auto-possessification is possible
|
|
between adjacent character-type opcodes. The left-hand (repeated) opcode is
|
|
used to select the row, and the right-hand opcode is use to select the column.
|
|
A value of 1 means that auto-possessification is OK. For example, the second
|
|
value in the first row means that \D+\d can be turned into \D++\d.
|
|
|
|
The Unicode property types (\P and \p) have to be present to fill out the table
|
|
because of what their opcode values are, but the table values should always be
|
|
zero because property types are handled separately in the code. The last four
|
|
columns apply to items that cannot be repeated, so there is no need to have
|
|
rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
|
|
*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
|
|
|
|
#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
|
|
#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
|
|
|
|
static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
|
|
/* \D \d \S \s \W \w . .+ \C \P \p \R \H \h \V \v \X \Z \z $ $M */
|
|
{ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \D */
|
|
{ 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \d */
|
|
{ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \S */
|
|
{ 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \s */
|
|
{ 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \W */
|
|
{ 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 }, /* \w */
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* . */
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* .+ */
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }, /* \C */
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \P */
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* \p */
|
|
{ 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \R */
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 }, /* \H */
|
|
{ 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \h */
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 }, /* \V */
|
|
{ 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 }, /* \v */
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 } /* \X */
|
|
};
|
|
|
|
|
|
/* This table is used to check whether auto-possessification is possible
|
|
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
|
|
left-hand (repeated) opcode is used to select the row, and the right-hand
|
|
opcode is used to select the column. The values are as follows:
|
|
|
|
0 Always return FALSE (never auto-possessify)
|
|
1 Character groups are distinct (possessify if both are OP_PROP)
|
|
2 Check character categories in the same group (general or particular)
|
|
3 TRUE if the two opcodes are not the same (PROP vs NOTPROP)
|
|
|
|
4 Check left general category vs right particular category
|
|
5 Check right general category vs left particular category
|
|
|
|
6 Left alphanum vs right general category
|
|
7 Left space vs right general category
|
|
8 Left word vs right general category
|
|
|
|
9 Right alphanum vs left general category
|
|
10 Right space vs left general category
|
|
11 Right word vs left general category
|
|
|
|
12 Left alphanum vs right particular category
|
|
13 Left space vs right particular category
|
|
14 Left word vs right particular category
|
|
|
|
15 Right alphanum vs left particular category
|
|
16 Right space vs left particular category
|
|
17 Right word vs left particular category
|
|
*/
|
|
|
|
static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
|
|
/* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */
|
|
{ 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */
|
|
{ 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */
|
|
{ 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */
|
|
{ 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */
|
|
{ 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */
|
|
{ 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */
|
|
{ 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */
|
|
{ 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */
|
|
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */
|
|
};
|
|
|
|
/* This table is used to check whether auto-possessification is possible
|
|
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
|
|
specifies a general category and the other specifies a particular category. The
|
|
row is selected by the general category and the column by the particular
|
|
category. The value is 1 if the particular category is not part of the general
|
|
category. */
|
|
|
|
static const pcre_uint8 catposstab[7][30] = {
|
|
/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
|
|
{ 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */
|
|
{ 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */
|
|
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */
|
|
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */
|
|
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */
|
|
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */
|
|
{ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */
|
|
};
|
|
|
|
/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
|
|
a general or particular category. The properties in each row are those
|
|
that apply to the character set in question. Duplication means that a little
|
|
unnecessary work is done when checking, but this keeps things much simpler
|
|
because they can all use the same code. For more details see the comment where
|
|
this table is used.
|
|
|
|
Note: SPACE and PXSPACE used to be different because Perl excluded VT from
|
|
"space", but from Perl 5.18 it's included, so both categories are treated the
|
|
same here. */
|
|
|
|
static const pcre_uint8 posspropstab[3][4] = {
|
|
{ ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */
|
|
{ ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */
|
|
{ ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */
|
|
};
|
|
|
|
/* This table is used when converting repeating opcodes into possessified
|
|
versions as a result of an explicit possessive quantifier such as ++. A zero
|
|
value means there is no possessified version - in those cases the item in
|
|
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
|
|
because all relevant opcodes are less than that. */
|
|
|
|
static const pcre_uint8 opcode_possessify[] = {
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */
|
|
|
|
0, /* NOTI */
|
|
OP_POSSTAR, 0, /* STAR, MINSTAR */
|
|
OP_POSPLUS, 0, /* PLUS, MINPLUS */
|
|
OP_POSQUERY, 0, /* QUERY, MINQUERY */
|
|
OP_POSUPTO, 0, /* UPTO, MINUPTO */
|
|
0, /* EXACT */
|
|
0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */
|
|
|
|
OP_POSSTARI, 0, /* STARI, MINSTARI */
|
|
OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */
|
|
OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */
|
|
OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */
|
|
0, /* EXACTI */
|
|
0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */
|
|
|
|
OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */
|
|
OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */
|
|
OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */
|
|
OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */
|
|
0, /* NOTEXACT */
|
|
0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
|
|
|
|
OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */
|
|
OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */
|
|
OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */
|
|
OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */
|
|
0, /* NOTEXACTI */
|
|
0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
|
|
|
|
OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */
|
|
OP_TYPEPOSPLUS, 0, /* TYPEPLUS, TYPEMINPLUS */
|
|
OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */
|
|
OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */
|
|
0, /* TYPEEXACT */
|
|
0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
|
|
|
|
OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */
|
|
OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */
|
|
OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */
|
|
OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */
|
|
0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */
|
|
|
|
0, 0, 0, /* CLASS, NCLASS, XCLASS */
|
|
0, 0, /* REF, REFI */
|
|
0, 0, /* DNREF, DNREFI */
|
|
0, 0 /* RECURSE, CALLOUT */
|
|
};
|
|
|
|
|
|
|
|
/*************************************************
|
|
* Find an error text *
|
|
*************************************************/
|
|
|
|
/* The error texts are now all in one long string, to save on relocations. As
|
|
some of the text is of unknown length, we can't use a table of offsets.
|
|
Instead, just count through the strings. This is not a performance issue
|
|
because it happens only when there has been a compilation error.
|
|
|
|
Argument: the error number
|
|
Returns: pointer to the error string
|
|
*/
|
|
|
|
static const char *
|
|
find_error_text(int n)
|
|
{
|
|
const char *s = error_texts;
|
|
for (; n > 0; n--)
|
|
{
|
|
while (*s++ != CHAR_NULL) {};
|
|
if (*s == CHAR_NULL) return "Error text not found (please report)";
|
|
}
|
|
return s;
|
|
}
|
|
|
|
|
|
|
|
/*************************************************
|
|
* Expand the workspace *
|
|
*************************************************/
|
|
|
|
/* This function is called during the second compiling phase, if the number of
|
|
forward references fills the existing workspace, which is originally a block on
|
|
the stack. A larger block is obtained from malloc() unless the ultimate limit
|
|
has been reached or the increase will be rather small.
|
|
|
|
Argument: pointer to the compile data block
|
|
Returns: 0 if all went well, else an error number
|
|
*/
|
|
|
|
static int
|
|
expand_workspace(compile_data *cd)
|
|
{
|
|
pcre_uchar *newspace;
|
|
int newsize = cd->workspace_size * 2;
|
|
|
|
if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
|
|
if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
|
|
newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
|
|
return ERR72;
|
|
|
|
newspace = (PUBL(malloc))(IN_UCHARS(newsize));
|
|
if (newspace == NULL) return ERR21;
|
|
memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
|
|
cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
|
|
if (cd->workspace_size > COMPILE_WORK_SIZE)
|
|
(PUBL(free))((void *)cd->start_workspace);
|
|
cd->start_workspace = newspace;
|
|
cd->workspace_size = newsize;
|
|
return 0;
|
|
}
|
|
|
|
|
|
|
|
/*************************************************
|
|
* Check for counted repeat *
|
|
*************************************************/
|
|
|
|
/* This function is called when a '{' is encountered in a place where it might
|
|
start a quantifier. It looks ahead to see if it really is a quantifier or not.
|
|
It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
|
|
where the ddds are digits.
|
|
|
|
Arguments:
|
|
p pointer to the first char after '{'
|
|
|
|
Returns: TRUE or FALSE
|
|
*/
|
|
|
|
static BOOL
|
|
is_counted_repeat(const pcre_uchar *p)
|
|
{
|
|
if (!IS_DIGIT(*p)) return FALSE;
|
|
p++;
|
|
while (IS_DIGIT(*p)) p++;
|
|
if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
|
|
|
|
if (*p++ != CHAR_COMMA) return FALSE;
|
|
if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
|
|
|
|
if (!IS_DIGIT(*p)) return FALSE;
|
|
p++;
|
|
while (IS_DIGIT(*p)) p++;
|
|
|
|
return (*p == CHAR_RIGHT_CURLY_BRACKET);
|
|
}
|
|
|
|
|
|
|
|
/*************************************************
|
|
* Handle escapes *
|
|
*************************************************/
|
|
|
|
/* This function is called when a \ has been encountered. It either returns a
|
|
positive value for a simple escape such as \n, or 0 for a data character which
|
|
will be placed in chptr. A backreference to group n is returned as negative n.
|
|
When UTF-8 is enabled, a positive value greater than 255 may be returned in
|
|
chptr. On entry, ptr is pointing at the \. On exit, it is on the final
|
|
character of the escape sequence.
|
|
|
|
Arguments:
|
|
ptrptr points to the pattern position pointer
|
|
chptr points to a returned data character
|
|
errorcodeptr points to the errorcode variable
|
|
bracount number of previous extracting brackets
|
|
options the options bits
|
|
isclass TRUE if inside a character class
|
|
|
|
Returns: zero => a data character
|
|
positive => a special escape sequence
|
|
negative => a back reference
|
|
on error, errorcodeptr is set
|
|
*/
|
|
|
|
static int
|
|
check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
|
|
int bracount, int options, BOOL isclass)
|
|
{
|
|
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
|
|
BOOL utf = (options & PCRE_UTF8) != 0;
|
|
const pcre_uchar *ptr = *ptrptr + 1;
|
|
pcre_uint32 c;
|
|
int escape = 0;
|
|
int i;
|
|
|
|
GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
|
|
ptr--; /* Set pointer back to the last byte */
|
|
|
|
/* If backslash is at the end of the pattern, it's an error. */
|
|
|
|
if (c == CHAR_NULL) *errorcodeptr = ERR1;
|
|
|
|
/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
|
|
in a table. A non-zero result is something that can be returned immediately.
|
|
Otherwise further processing may be required. */
|
|
|
|
#ifndef EBCDIC /* ASCII/UTF-8 coding */
|
|
/* Not alphanumeric */
|
|
else if (c < CHAR_0 || c > CHAR_z) {}
|
|
else if ((i = escapes[c - CHAR_0]) != 0)
|
|
{ if (i > 0) c = (pcre_uint32)i; else escape = -i; }
|
|
|
|
#else /* EBCDIC coding */
|
|
/* Not alphanumeric */
|
|
else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
|
|
else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
|
|
#endif
|
|
|
|
/* Escapes that need further processing, or are illegal. */
|
|
|
|
else
|
|
{
|
|
const pcre_uchar *oldptr;
|
|
BOOL braced, negated, overflow;
|
|
int s;
|
|
|
|
switch (c)
|
|
{
|
|
/* A number of Perl escapes are not handled by PCRE. We give an explicit
|
|
error. */
|
|
|
|
case CHAR_l:
|
|
case CHAR_L:
|
|
*errorcodeptr = ERR37;
|
|
break;
|
|
|
|
case CHAR_u:
|
|
if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
|
|
{
|
|
/* In JavaScript, \u must be followed by four hexadecimal numbers.
|
|
Otherwise it is a lowercase u letter. */
|
|
if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
|
|
&& MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
|
|
&& MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
|
|
&& MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
|
|
{
|
|
c = 0;
|
|
for (i = 0; i < 4; ++i)
|
|
{
|
|
register pcre_uint32 cc = *(++ptr);
|
|
#ifndef EBCDIC /* ASCII/UTF-8 coding */
|
|
if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
|
|
c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
|
|
#else /* EBCDIC coding */
|
|
if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
|
|
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
|
|
#endif
|
|
}
|
|
|
|
#if defined COMPILE_PCRE8
|
|
if (c > (utf ? 0x10ffffU : 0xffU))
|
|
#elif defined COMPILE_PCRE16
|
|
if (c > (utf ? 0x10ffffU : 0xffffU))
|
|
#elif defined COMPILE_PCRE32
|
|
if (utf && c > 0x10ffffU)
|
|
#endif
|
|
{
|
|
*errorcodeptr = ERR76;
|
|
}
|
|
else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
|
|
}
|
|
}
|
|
else
|
|
*errorcodeptr = ERR37;
|
|
break;
|
|
|
|
case CHAR_U:
|
|
/* In JavaScript, \U is an uppercase U letter. */
|
|
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
|
|
break;
|
|
|
|
/* In a character class, \g is just a literal "g". Outside a character
|
|
class, \g must be followed by one of a number of specific things:
|
|
|
|
(1) A number, either plain or braced. If positive, it is an absolute
|
|
backreference. If negative, it is a relative backreference. This is a Perl
|
|
5.10 feature.
|
|
|
|
(2) Perl 5.10 also supports \g{name} as a reference to a named group. This
|
|
is part of Perl's movement towards a unified syntax for back references. As
|
|
this is synonymous with \k{name}, we fudge it up by pretending it really
|
|
was \k.
|
|
|
|
(3) For Oniguruma compatibility we also support \g followed by a name or a
|
|
number either in angle brackets or in single quotes. However, these are
|
|
(possibly recursive) subroutine calls, _not_ backreferences. Just return
|
|
the ESC_g code (cf \k). */
|
|
|
|
case CHAR_g:
|
|
if (isclass) break;
|
|
if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
|
|
{
|
|
escape = ESC_g;
|
|
break;
|
|
}
|
|
|
|
/* Handle the Perl-compatible cases */
|
|
|
|
if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
|
|
{
|
|
const pcre_uchar *p;
|
|
for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
|
|
if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
|
|
if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
|
|
{
|
|
escape = ESC_k;
|
|
break;
|
|
}
|
|
braced = TRUE;
|
|
ptr++;
|
|
}
|
|
else braced = FALSE;
|
|
|
|
if (ptr[1] == CHAR_MINUS)
|
|
{
|
|
negated = TRUE;
|
|
ptr++;
|
|
}
|
|
else negated = FALSE;
|
|
|
|
/* The integer range is limited by the machine's int representation. */
|
|
s = 0;
|
|
overflow = FALSE;
|
|
while (IS_DIGIT(ptr[1]))
|
|
{
|
|
if (s > INT_MAX / 10 - 1) /* Integer overflow */
|
|
{
|
|
overflow = TRUE;
|
|
break;
|
|
}
|
|
s = s * 10 + (int)(*(++ptr) - CHAR_0);
|
|
}
|
|
if (overflow) /* Integer overflow */
|
|
{
|
|
while (IS_DIGIT(ptr[1]))
|
|
ptr++;
|
|
*errorcodeptr = ERR61;
|
|
break;
|
|
}
|
|
|
|
if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
|
|
{
|
|
*errorcodeptr = ERR57;
|
|
break;
|
|
}
|
|
|
|
if (s == 0)
|
|
{
|
|
*errorcodeptr = ERR58;
|
|
break;
|
|
}
|
|
|
|
if (negated)
|
|
{
|
|
if (s > bracount)
|
|
{
|
|
*errorcodeptr = ERR15;
|
|
break;
|
|
}
|
|
s = bracount - (s - 1);
|
|
}
|
|
|
|
escape = -s;
|
|
break;
|
|
|
|
/* The handling of escape sequences consisting of a string of digits
|
|
starting with one that is not zero is not straightforward. Perl has changed
|
|
over the years. Nowadays \g{} for backreferences and \o{} for octal are
|
|
recommended to avoid the ambiguities in the old syntax.
|
|
|
|
Outside a character class, the digits are read as a decimal number. If the
|
|
number is less than 8 (used to be 10), or if there are that many previous
|
|
extracting left brackets, then it is a back reference. Otherwise, up to
|
|
three octal digits are read to form an escaped byte. Thus \123 is likely to
|
|
be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
|
|
the octal value is greater than 377, the least significant 8 bits are
|
|
taken. \8 and \9 are treated as the literal characters 8 and 9.
|
|
|
|
Inside a character class, \ followed by a digit is always either a literal
|
|
8 or 9 or an octal number. */
|
|
|
|
case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
|
|
case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
|
|
|
|
if (!isclass)
|
|
{
|
|
oldptr = ptr;
|
|
/* The integer range is limited by the machine's int representation. */
|
|
s = (int)(c -CHAR_0);
|
|
overflow = FALSE;
|
|
while (IS_DIGIT(ptr[1]))
|
|
{
|
|
if (s > INT_MAX / 10 - 1) /* Integer overflow */
|
|
{
|
|
overflow = TRUE;
|
|
break;
|
|
}
|
|
s = s * 10 + (int)(*(++ptr) - CHAR_0);
|
|
}
|
|
if (overflow) /* Integer overflow */
|
|
{
|
|
while (IS_DIGIT(ptr[1]))
|
|
ptr++;
|
|
*errorcodeptr = ERR61;
|
|
break;
|
|
}
|
|
if (s < 8 || s <= bracount) /* Check for back reference */
|
|
{
|
|
escape = -s;
|
|
break;
|
|
}
|
|
ptr = oldptr; /* Put the pointer back and fall through */
|
|
}
|
|
|
|
/* Handle a digit following \ when the number is not a back reference. If
|
|
the first digit is 8 or 9, Perl used to generate a binary zero byte and
|
|
then treat the digit as a following literal. At least by Perl 5.18 this
|
|
changed so as not to insert the binary zero. */
|
|
|
|
if ((c = *ptr) >= CHAR_8) break;
|
|
|
|
/* Fall through with a digit less than 8 */
|
|
|
|
/* \0 always starts an octal number, but we may drop through to here with a
|
|
larger first octal digit. The original code used just to take the least
|
|
significant 8 bits of octal numbers (I think this is what early Perls used
|
|
to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
|
|
but no more than 3 octal digits. */
|
|
|
|
case CHAR_0:
|
|
c -= CHAR_0;
|
|
while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
|
|
c = c * 8 + *(++ptr) - CHAR_0;
|
|
#ifdef COMPILE_PCRE8
|
|
if (!utf && c > 0xff) *errorcodeptr = ERR51;
|
|
#endif
|
|
break;
|
|
|
|
/* \o is a relatively new Perl feature, supporting a more general way of
|
|
specifying character codes in octal. The only supported form is \o{ddd}. */
|
|
|
|
case CHAR_o:
|
|
if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
|
|
if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
|
|
{
|
|
ptr += 2;
|
|
c = 0;
|
|
overflow = FALSE;
|
|
while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
|
|
{
|
|
register pcre_uint32 cc = *ptr++;
|
|
if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
|
|
#ifdef COMPILE_PCRE32
|
|
if (c >= 0x20000000l) { overflow = TRUE; break; }
|
|
#endif
|
|
c = (c << 3) + cc - CHAR_0 ;
|
|
#if defined COMPILE_PCRE8
|
|
if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
|
|
#elif defined COMPILE_PCRE16
|
|
if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
|
|
#elif defined COMPILE_PCRE32
|
|
if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
|
|
#endif
|
|
}
|
|
if (overflow)
|
|
{
|
|
while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
|
|
*errorcodeptr = ERR34;
|
|
}
|
|
else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
|
|
{
|
|
if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
|
|
}
|
|
else *errorcodeptr = ERR80;
|
|
}
|
|
break;
|
|
|
|
/* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
|
|
numbers. Otherwise it is a lowercase x letter. */
|
|
|
|
case CHAR_x:
|
|
if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
|
|
{
|
|
if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
|
|
&& MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
|
|
{
|
|
c = 0;
|
|
for (i = 0; i < 2; ++i)
|
|
{
|
|
register pcre_uint32 cc = *(++ptr);
|
|
#ifndef EBCDIC /* ASCII/UTF-8 coding */
|
|
if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
|
|
c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
|
|
#else /* EBCDIC coding */
|
|
if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
|
|
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
|
|
#endif
|
|
}
|
|
}
|
|
} /* End JavaScript handling */
|
|
|
|
/* Handle \x in Perl's style. \x{ddd} is a character number which can be
|
|
greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
|
|
digits. If not, { used to be treated as a data character. However, Perl
|
|
seems to read hex digits up to the first non-such, and ignore the rest, so
|
|
that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
|
|
now gives an error. */
|
|
|
|
else
|
|
{
|
|
if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
|
|
{
|
|
ptr += 2;
|
|
if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
|
|
{
|
|
*errorcodeptr = ERR86;
|
|
break;
|
|
}
|
|
c = 0;
|
|
overflow = FALSE;
|
|
while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
|
|
{
|
|
register pcre_uint32 cc = *ptr++;
|
|
if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
|
|
|
|
#ifdef COMPILE_PCRE32
|
|
if (c >= 0x10000000l) { overflow = TRUE; break; }
|
|
#endif
|
|
|
|
#ifndef EBCDIC /* ASCII/UTF-8 coding */
|
|
if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
|
|
c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
|
|
#else /* EBCDIC coding */
|
|
if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
|
|
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
|
|
#endif
|
|
|
|
#if defined COMPILE_PCRE8
|
|
if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
|
|
#elif defined COMPILE_PCRE16
|
|
if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
|
|
#elif defined COMPILE_PCRE32
|
|
if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
|
|
#endif
|
|
}
|
|
|
|
if (overflow)
|
|
{
|
|
while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
|
|
*errorcodeptr = ERR34;
|
|
}
|
|
|
|
else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
|
|
{
|
|
if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
|
|
}
|
|
|
|
/* If the sequence of hex digits does not end with '}', give an error.
|
|
We used just to recognize this construct and fall through to the normal
|
|
\x handling, but nowadays Perl gives an error, which seems much more
|
|
sensible, so we do too. */
|
|
|
|
else *errorcodeptr = ERR79;
|
|
} /* End of \x{} processing */
|
|
|
|
/* Read a single-byte hex-defined char (up to two hex digits after \x) */
|
|
|
|
else
|
|
{
|
|
c = 0;
|
|
while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
|
|
{
|
|
pcre_uint32 cc; /* Some compilers don't like */
|
|
cc = *(++ptr); /* ++ in initializers */
|
|
#ifndef EBCDIC /* ASCII/UTF-8 coding */
|
|
if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
|
|
c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
|
|
#else /* EBCDIC coding */
|
|
if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
|
|
c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
|
|
#endif
|
|
}
|
|
} /* End of \xdd handling */
|
|
} /* End of Perl-style \x handling */
|
|
break;
|
|
|
|
/* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
|
|
An error is given if the byte following \c is not an ASCII character. This
|
|
coding is ASCII-specific, but then the whole concept of \cx is
|
|
ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
|
|
|
|
case CHAR_c:
|
|
c = *(++ptr);
|
|
if (c == CHAR_NULL)
|
|
{
|
|
*errorcodeptr = ERR2;
|
|
break;
|
|
}
|
|
#ifndef EBCDIC /* ASCII/UTF-8 coding */
|
|
if (c > 127) /* Excludes all non-ASCII in either mode */
|
|
{
|
|
*errorcodeptr = ERR68;
|
|
break;
|
|
}
|
|
if (c >= CHAR_a && c <= CHAR_z) c -= 32;
|
|
c ^= 0x40;
|
|
#else /* EBCDIC coding */
|
|
if (c >= CHAR_a && c <= CHAR_z) c += 64;
|
|
if (c == CHAR_QUESTION_MARK)
|
|
c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
|
|
else
|
|
{
|
|
for (i = 0; i < 32; i++)
|
|
{
|
|
if (c == ebcdic_escape_c[i]) break;
|
|
}
|
|
if (i < 32) c = i; else *errorcodeptr = ERR68;
|
|
}
|
|
#endif
|
|
break;
|
|
|
|
/* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
|
|
other alphanumeric following \ is an error if PCRE_EXTRA was set;
|
|
otherwise, for Perl compatibility, it is a literal. This code looks a bit
|
|
odd, but there used to be some cases other than the default, and there may
|
|
be again in future, so I haven't "optimized" it. */
|
|
|
|
default:
|
|
if ((options & PCRE_EXTRA) != 0) switch(c)
|
|
{
|
|
default:
|
|
*errorcodeptr = ERR3;
|
|
break;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Perl supports \N{name} for character names, as well as plain \N for "not
|
|
newline". PCRE does not support \N{name}. However, it does support
|
|
quantification such as \N{2,3}. */
|
|
|
|
if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
|
|
!is_counted_repeat(ptr+2))
|
|
*errorcodeptr = ERR37;
|
|
|
|
/* If PCRE_UCP is set, we change the values for \d etc. */
|
|
|
|
if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
|
|
escape += (ESC_DU - ESC_D);
|
|
|
|
/* Set the pointer to the final character before returning. */
|
|
|
|
*ptrptr = ptr;
|
|
*chptr = c;
|
|
return escape;
|
|
}
|
|
|
|
|
|
|
|
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence (or at the point where scanning stopped, on error).

The property name is either the single character that follows, or a name in
curly brackets, optionally preceded by ^ for negation. A braced name may be at
most 31 characters long. The name is then looked up in the table of known
property names.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  ptypeptr       points to an unsigned int that is set to the type value
  pdataptr       points to an unsigned int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         TRUE if the type value was found, or FALSE for an invalid type
                 (errorcodeptr is set to ERR46 for a malformed sequence or
                 ERR47 for a name that is not in the table)
*/

static BOOL
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
  unsigned int *pdataptr, int *errorcodeptr)
{
pcre_uchar c;
int i, bot, top;
const pcre_uchar *ptr = *ptrptr;
pcre_uchar name[32];

/* Give the negation flag a defined value before any exit can be taken, so
that it is never left unset, even when the pattern ends straight after the
\P or \p. */

*negptr = FALSE;

c = *(++ptr);
if (c == CHAR_NULL) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == CHAR_LEFT_CURLY_BRACKET)
  {
  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy the name, leaving room for the terminating zero. Running out of
  pattern, or out of buffer before '}' is seen, is an error. */

  for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
    {
    c = *(++ptr);
    if (c == CHAR_NULL) goto ERROR_RETURN;
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
    name[i] = c;
    }
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

/* The escape sequence is syntactically complete; record where it ends. This
also serves the "unknown name" exit below, because ptr does not move again. */

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = PRIV(utt_size);

while (bot < top)
  {
  int r;
  i = (bot + top) >> 1;
  r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
  if (r == 0)
    {
    *ptypeptr = PRIV(utt)[i].type;
    *pdataptr = PRIV(utt)[i].value;
    return TRUE;
    }
  if (r > 0) bot = i + 1; else top = i;
  }

/* The name is well formed but is not in the table. */

*errorcodeptr = ERR47;
return FALSE;

/* The sequence itself is malformed. */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return FALSE;
}
#endif
|
|
|
|
|
|
|
|
/*************************************************
|
|
* Read repeat counts *
|
|
*************************************************/
|
|
|
|
/* Read an item of the form {n,m} and return the values. This is called only
|
|
after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
|
|
so the syntax is guaranteed to be correct, but we need to check the values.
|
|
|
|
Arguments:
|
|
p pointer to first char after '{'
|
|
minp pointer to int for min
|
|
maxp pointer to int for max
|
|
returned as -1 if no max
|
|
errorcodeptr points to error code variable
|
|
|
|
Returns: pointer to '}' on success;
|
|
current ptr on error, with errorcodeptr set non-zero
|
|
*/
|
|
|
|
static const pcre_uchar *
|
|
read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
|
|
{
|
|
int min = 0;
|
|
int max = -1;
|
|
|
|
while (IS_DIGIT(*p))
|
|
{
|
|
min = min * 10 + (int)(*p++ - CHAR_0);
|
|
if (min > 65535)
|
|
{
|
|
*errorcodeptr = ERR5;
|
|
return p;
|
|
}
|
|
}
|
|
|
|
if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
|
|
{
|
|
if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
|
|
{
|
|
max = 0;
|
|
while(IS_DIGIT(*p))
|
|
{
|
|
max = max * 10 + (int)(*p++ - CHAR_0);
|
|
if (max > 65535)
|
|
{
|
|
*errorcodeptr = ERR5;
|
|
return p;
|
|
}
|
|
}
|
|
if (max < min)
|
|
{
|
|
*errorcodeptr = ERR4;
|
|
return p;
|
|
}
|
|
}
|
|
}
|
|
|
|
*minp = min;
|
|
*maxp = max;
|
|
return p;
|
|
}
|
|
|
|
|
|
|
|
/*************************************************
|
|
* Find first significant op code *
|
|
*************************************************/
|
|
|
|
/* This is called by several functions that scan a compiled expression looking
|
|
for a fixed first character, or an anchoring op code etc. It skips over things
|
|
that do not influence this. For some calls, it makes sense to skip negative
|
|
forward and all backward assertions, and also the \b assertion; for others it
|
|
does not.
|
|
|
|
Arguments:
|
|
code pointer to the start of the group
|
|
skipassert TRUE if certain assertions are to be skipped
|
|
|
|
Returns: pointer to the first significant opcode
|
|
*/
|
|
|
|
static const pcre_uchar*
|
|
first_significant_code(const pcre_uchar *code, BOOL skipassert)
|
|
{
|
|
for (;;)
|
|
{
|
|
switch ((int)*code)
|
|
{
|
|
case OP_ASSERT_NOT:
|
|
case OP_ASSERTBACK:
|
|
case OP_ASSERTBACK_NOT:
|
|
if (!skipassert) return code;
|
|
do code += GET(code, 1); while (*code == OP_ALT);
|
|
code += PRIV(OP_lengths)[*code];
|
|
break;
|
|
|
|
case OP_WORD_BOUNDARY:
|
|
case OP_NOT_WORD_BOUNDARY:
|
|
if (!skipassert) return code;
|
|
/* Fall through */
|
|
|
|
case OP_CALLOUT:
|
|
case OP_CREF:
|
|
case OP_DNCREF:
|
|
case OP_RREF:
|
|
case OP_DNRREF:
|
|
case OP_DEF:
|
|
code += PRIV(OP_lengths)[*code];
|
|
break;
|
|
|
|
default:
|
|
return code;
|
|
}
|
|
}
|
|
/* Control never reaches here */
|
|
}



/*************************************************
*       Find the fixed length of a branch        *
*************************************************/

/* Scan a branch and compute the fixed length of subject that will match it,
|
|
if the length is fixed. This is needed for dealing with backward assertions.
|
|
In UTF8 mode, the result is in characters rather than bytes. The branch is
|
|
temporarily terminated with OP_END when this function is called.
|
|
|
|
This function is called when a backward assertion is encountered, so that if it
|
|
fails, the error message can point to the correct place in the pattern.
|
|
However, we cannot do this when the assertion contains subroutine calls,
|
|
because they can be forward references. We solve this by remembering this case
|
|
and doing the check at the end; a flag specifies which mode we are running in.
|
|
|
|
Arguments:
|
|
code points to the start of the pattern (the bracket)
|
|
utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
|
|
atend TRUE if called when the pattern is complete
|
|
cd the "compile data" structure
|
|
recurses chain of recurse_check to catch mutual recursion
|
|
|
|
Returns: the fixed length,
|
|
or -1 if there is no fixed length,
|
|
or -2 if \C was encountered (in UTF-8 mode only)
|
|
or -3 if an OP_RECURSE item was encountered and atend is FALSE
|
|
or -4 if an unknown opcode was encountered (internal error)
|
|
*/
|
|
|
|
static int
|
|
find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
|
|
recurse_check *recurses)
|
|
{
|
|
int length = -1;
|
|
recurse_check this_recurse;
|
|
register int branchlength = 0;
|
|
register pcre_uchar *cc = code + 1 + LINK_SIZE;
|
|
|
|
/* Scan along the opcodes for this branch. If we get to the end of the
|
|
branch, check the length against that of the other branches. */
|
|
|
|
for (;;)
|
|
{
|
|
int d;
|
|
pcre_uchar *ce, *cs;
|
|
register pcre_uchar op = *cc;
|
|
|
|
switch (op)
|
|
{
|
|
/* We only need to continue for OP_CBRA (normal capturing bracket) and
|
|
OP_BRA (normal non-capturing bracket) because the other variants of these
|
|
opcodes are all concerned with unlimited repeated groups, which of course
|
|
are not of fixed length. */
|
|
|
|
case OP_CBRA:
|
|
case OP_BRA:
|
|
case OP_ONCE:
|
|
case OP_ONCE_NC:
|
|
case OP_COND:
|
|
d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
|
|
recurses);
|
|
if (d < 0) return d;
|
|
branchlength += d;
|
|
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
|
cc += 1 + LINK_SIZE;
|
|
break;
|
|
|
|
/* Reached end of a branch; if it's a ket it is the end of a nested call.
|
|
If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
|
|
an ALT. If it is END it's the end of the outer call. All can be handled by
|
|
the same code. Note that we must not include the OP_KETRxxx opcodes here,
|
|
because they all imply an unlimited repeat. */
|
|
|
|
case OP_ALT:
|
|
case OP_KET:
|
|
case OP_END:
|
|
case OP_ACCEPT:
|
|
case OP_ASSERT_ACCEPT:
|
|
if (length < 0) length = branchlength;
|
|
else if (length != branchlength) return -1;
|
|
if (*cc != OP_ALT) return length;
|
|
cc += 1 + LINK_SIZE;
|
|
branchlength = 0;
|
|
break;
|
|
|
|
/* A true recursion implies not fixed length, but a subroutine call may
|
|
be OK. If the subroutine is a forward reference, we can't deal with
|
|
it until the end of the pattern, so return -3. */
|
|
|
|
case OP_RECURSE:
|
|
if (!atend) return -3;
|
|
cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */
|
|
do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */
|
|
if (cc > cs && cc < ce) return -1; /* Recursion */
|
|
else /* Check for mutual recursion */
|
|
{
|
|
recurse_check *r = recurses;
|
|
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
|
|
if (r != NULL) return -1; /* Mutual recursion */
|
|
}
|
|
this_recurse.prev = recurses;
|
|
this_recurse.group = cs;
|
|
d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
|
|
if (d < 0) return d;
|
|
branchlength += d;
|
|
cc += 1 + LINK_SIZE;
|
|
break;
|
|
|
|
/* Skip over assertive subpatterns */
|
|
|
|
case OP_ASSERT:
|
|
case OP_ASSERT_NOT:
|
|
case OP_ASSERTBACK:
|
|
case OP_ASSERTBACK_NOT:
|
|
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
|
cc += 1 + LINK_SIZE;
|
|
break;
|
|
|
|
/* Skip over things that don't match chars */
|
|
|
|
case OP_MARK:
|
|
case OP_PRUNE_ARG:
|
|
case OP_SKIP_ARG:
|
|
case OP_THEN_ARG:
|
|
cc += cc[1] + PRIV(OP_lengths)[*cc];
|
|
break;
|
|
|
|
case OP_CALLOUT:
|
|
case OP_CIRC:
|
|
case OP_CIRCM:
|
|
case OP_CLOSE:
|
|
case OP_COMMIT:
|
|
case OP_CREF:
|
|
case OP_DEF:
|
|
case OP_DNCREF:
|
|
case OP_DNRREF:
|
|
case OP_DOLL:
|
|
case OP_DOLLM:
|
|
case OP_EOD:
|
|
case OP_EODN:
|
|
case OP_FAIL:
|
|
case OP_NOT_WORD_BOUNDARY:
|
|
case OP_PRUNE:
|
|
case OP_REVERSE:
|
|
case OP_RREF:
|
|
case OP_SET_SOM:
|
|
case OP_SKIP:
|
|
case OP_SOD:
|
|
case OP_SOM:
|
|
case OP_THEN:
|
|
case OP_WORD_BOUNDARY:
|
|
cc += PRIV(OP_lengths)[*cc];
|
|
break;
|
|
|
|
/* Handle literal characters */
|
|
|
|
case OP_CHAR:
|
|
case OP_CHARI:
|
|
case OP_NOT:
|
|
case OP_NOTI:
|
|
branchlength++;
|
|
cc += 2;
|
|
#ifdef SUPPORT_UTF
|
|
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
|
|
#endif
|
|
break;
|
|
|
|
/* Handle exact repetitions. The count is already in characters, but we
|
|
need to skip over a multibyte character in UTF8 mode. */
|
|
|
|
case OP_EXACT:
|
|
case OP_EXACTI:
|
|
case OP_NOTEXACT:
|
|
case OP_NOTEXACTI:
|
|
branchlength += (int)GET2(cc,1);
|
|
cc += 2 + IMM2_SIZE;
|
|
#ifdef SUPPORT_UTF
|
|
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
|
|
#endif
|
|
break;
|
|
|
|
case OP_TYPEEXACT:
|
|
branchlength += GET2(cc,1);
|
|
if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
|
|
cc += 2;
|
|
cc += 1 + IMM2_SIZE + 1;
|
|
break;
|
|
|
|
/* Handle single-char matchers */
|
|
|
|
case OP_PROP:
|
|
case OP_NOTPROP:
|
|
cc += 2;
|
|
/* Fall through */
|
|
|
|
case OP_HSPACE:
|
|
case OP_VSPACE:
|
|
case OP_NOT_HSPACE:
|
|
case OP_NOT_VSPACE:
|
|
case OP_NOT_DIGIT:
|
|
case OP_DIGIT:
|
|
case OP_NOT_WHITESPACE:
|
|
case OP_WHITESPACE:
|
|
case OP_NOT_WORDCHAR:
|
|
case OP_WORDCHAR:
|
|
case OP_ANY:
|
|
case OP_ALLANY:
|
|
branchlength++;
|
|
cc++;
|
|
break;
|
|
|
|
/* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
|
|
otherwise \C is coded as OP_ALLANY. */
|
|
|
|
case OP_ANYBYTE:
|
|
return -2;
|
|
|
|
/* Check a class for variable quantification */
|
|
|
|
case OP_CLASS:
|
|
case OP_NCLASS:
|
|
#if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
|
|
case OP_XCLASS:
|
|
/* The original code caused an unsigned overflow in 64 bit systems,
|
|
so now we use a conditional statement. */
|
|
if (op == OP_XCLASS)
|
|
cc += GET(cc, 1);
|
|
else
|
|
cc += PRIV(OP_lengths)[OP_CLASS];
|
|
#else
|
|
cc += PRIV(OP_lengths)[OP_CLASS];
|
|
#endif
|
|
|
|
switch (*cc)
|
|
{
|
|
case OP_CRSTAR:
|
|
case OP_CRMINSTAR:
|
|
case OP_CRPLUS:
|
|
case OP_CRMINPLUS:
|
|
case OP_CRQUERY:
|
|
case OP_CRMINQUERY:
|
|
case OP_CRPOSSTAR:
|
|
case OP_CRPOSPLUS:
|
|
case OP_CRPOSQUERY:
|
|
return -1;
|
|
|
|
case OP_CRRANGE:
|
|
case OP_CRMINRANGE:
|
|
case OP_CRPOSRANGE:
|
|
if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
|
|
branchlength += (int)GET2(cc,1);
|
|
cc += 1 + 2 * IMM2_SIZE;
|
|
break;
|
|
|
|
default:
|
|
branchlength++;
|
|
}
|
|
break;
|
|
|
|
/* Anything else is variable length */
|
|
|
|
case OP_ANYNL:
|
|
case OP_BRAMINZERO:
|
|
case OP_BRAPOS:
|
|
case OP_BRAPOSZERO:
|
|
case OP_BRAZERO:
|
|
case OP_CBRAPOS:
|
|
case OP_EXTUNI:
|
|
case OP_KETRMAX:
|
|
case OP_KETRMIN:
|
|
case OP_KETRPOS:
|
|
case OP_MINPLUS:
|
|
case OP_MINPLUSI:
|
|
case OP_MINQUERY:
|
|
case OP_MINQUERYI:
|
|
case OP_MINSTAR:
|
|
case OP_MINSTARI:
|
|
case OP_MINUPTO:
|
|
case OP_MINUPTOI:
|
|
case OP_NOTMINPLUS:
|
|
case OP_NOTMINPLUSI:
|
|
case OP_NOTMINQUERY:
|
|
case OP_NOTMINQUERYI:
|
|
case OP_NOTMINSTAR:
|
|
case OP_NOTMINSTARI:
|
|
case OP_NOTMINUPTO:
|
|
case OP_NOTMINUPTOI:
|
|
case OP_NOTPLUS:
|
|
case OP_NOTPLUSI:
|
|
case OP_NOTPOSPLUS:
|
|
case OP_NOTPOSPLUSI:
|
|
case OP_NOTPOSQUERY:
|
|
case OP_NOTPOSQUERYI:
|
|
case OP_NOTPOSSTAR:
|
|
case OP_NOTPOSSTARI:
|
|
case OP_NOTPOSUPTO:
|
|
case OP_NOTPOSUPTOI:
|
|
case OP_NOTQUERY:
|
|
case OP_NOTQUERYI:
|
|
case OP_NOTSTAR:
|
|
case OP_NOTSTARI:
|
|
case OP_NOTUPTO:
|
|
case OP_NOTUPTOI:
|
|
case OP_PLUS:
|
|
case OP_PLUSI:
|
|
case OP_POSPLUS:
|
|
case OP_POSPLUSI:
|
|
case OP_POSQUERY:
|
|
case OP_POSQUERYI:
|
|
case OP_POSSTAR:
|
|
case OP_POSSTARI:
|
|
case OP_POSUPTO:
|
|
case OP_POSUPTOI:
|
|
case OP_QUERY:
|
|
case OP_QUERYI:
|
|
case OP_REF:
|
|
case OP_REFI:
|
|
case OP_DNREF:
|
|
case OP_DNREFI:
|
|
case OP_SBRA:
|
|
case OP_SBRAPOS:
|
|
case OP_SCBRA:
|
|
case OP_SCBRAPOS:
|
|
case OP_SCOND:
|
|
case OP_SKIPZERO:
|
|
case OP_STAR:
|
|
case OP_STARI:
|
|
case OP_TYPEMINPLUS:
|
|
case OP_TYPEMINQUERY:
|
|
case OP_TYPEMINSTAR:
|
|
case OP_TYPEMINUPTO:
|
|
case OP_TYPEPLUS:
|
|
case OP_TYPEPOSPLUS:
|
|
case OP_TYPEPOSQUERY:
|
|
case OP_TYPEPOSSTAR:
|
|
case OP_TYPEPOSUPTO:
|
|
case OP_TYPEQUERY:
|
|
case OP_TYPESTAR:
|
|
case OP_TYPEUPTO:
|
|
case OP_UPTO:
|
|
case OP_UPTOI:
|
|
return -1;
|
|
|
|
/* Catch unrecognized opcodes so that when new ones are added they
|
|
are not forgotten, as has happened in the past. */
|
|
|
|
default:
|
|
return -4;
|
|
}
|
|
}
|
|
/* Control never gets here */
|
|
}



/*************************************************
*    Scan compiled regex for specific bracket    *
*************************************************/

/* This little function scans through a compiled pattern until it finds a
|
|
capturing bracket with the given number, or, if the number is negative, an
|
|
instance of OP_REVERSE for a lookbehind. The function is global in the C sense
|
|
so that it can be called from pcre_study() when finding the minimum matching
|
|
length.
|
|
|
|
Arguments:
|
|
code points to start of expression
|
|
utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
|
|
number the required bracket number or negative to find a lookbehind
|
|
|
|
Returns: pointer to the opcode for the bracket, or NULL if not found
|
|
*/
|
|
|
|
const pcre_uchar *
|
|
PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
|
|
{
|
|
for (;;)
|
|
{
|
|
register pcre_uchar c = *code;
|
|
|
|
if (c == OP_END) return NULL;
|
|
|
|
/* XCLASS is used for classes that cannot be represented just by a bit
|
|
map. This includes negated single high-valued characters. The length in
|
|
the table is zero; the actual length is stored in the compiled code. */
|
|
|
|
if (c == OP_XCLASS) code += GET(code, 1);
|
|
|
|
/* Handle recursion */
|
|
|
|
else if (c == OP_REVERSE)
|
|
{
|
|
if (number < 0) return (pcre_uchar *)code;
|
|
code += PRIV(OP_lengths)[c];
|
|
}
|
|
|
|
/* Handle capturing bracket */
|
|
|
|
else if (c == OP_CBRA || c == OP_SCBRA ||
|
|
c == OP_CBRAPOS || c == OP_SCBRAPOS)
|
|
{
|
|
int n = (int)GET2(code, 1+LINK_SIZE);
|
|
if (n == number) return (pcre_uchar *)code;
|
|
code += PRIV(OP_lengths)[c];
|
|
}
|
|
|
|
/* Otherwise, we can get the item's length from the table, except that for
|
|
repeated character types, we have to test for \p and \P, which have an extra
|
|
two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
|
|
must add in its length. */
|
|
|
|
else
|
|
{
|
|
switch(c)
|
|
{
|
|
case OP_TYPESTAR:
|
|
case OP_TYPEMINSTAR:
|
|
case OP_TYPEPLUS:
|
|
case OP_TYPEMINPLUS:
|
|
case OP_TYPEQUERY:
|
|
case OP_TYPEMINQUERY:
|
|
case OP_TYPEPOSSTAR:
|
|
case OP_TYPEPOSPLUS:
|
|
case OP_TYPEPOSQUERY:
|
|
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
|
|
break;
|
|
|
|
case OP_TYPEUPTO:
|
|
case OP_TYPEMINUPTO:
|
|
case OP_TYPEEXACT:
|
|
case OP_TYPEPOSUPTO:
|
|
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
|
|
code += 2;
|
|
break;
|
|
|
|
case OP_MARK:
|
|
case OP_PRUNE_ARG:
|
|
case OP_SKIP_ARG:
|
|
case OP_THEN_ARG:
|
|
code += code[1];
|
|
break;
|
|
}
|
|
|
|
/* Add in the fixed length from the table */
|
|
|
|
code += PRIV(OP_lengths)[c];
|
|
|
|
/* In UTF-8 mode, opcodes that are followed by a character may be followed by
|
|
a multi-byte character. The length in the table is a minimum, so we have to
|
|
arrange to skip the extra bytes. */
|
|
|
|
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
|
|
if (utf) switch(c)
|
|
{
|
|
case OP_CHAR:
|
|
case OP_CHARI:
|
|
case OP_NOT:
|
|
case OP_NOTI:
|
|
case OP_EXACT:
|
|
case OP_EXACTI:
|
|
case OP_NOTEXACT:
|
|
case OP_NOTEXACTI:
|
|
case OP_UPTO:
|
|
case OP_UPTOI:
|
|
case OP_NOTUPTO:
|
|
case OP_NOTUPTOI:
|
|
case OP_MINUPTO:
|
|
case OP_MINUPTOI:
|
|
case OP_NOTMINUPTO:
|
|
case OP_NOTMINUPTOI:
|
|
case OP_POSUPTO:
|
|
case OP_POSUPTOI:
|
|
case OP_NOTPOSUPTO:
|
|
case OP_NOTPOSUPTOI:
|
|
case OP_STAR:
|
|
case OP_STARI:
|
|
case OP_NOTSTAR:
|
|
case OP_NOTSTARI:
|
|
case OP_MINSTAR:
|
|
case OP_MINSTARI:
|
|
case OP_NOTMINSTAR:
|
|
case OP_NOTMINSTARI:
|
|
case OP_POSSTAR:
|
|
case OP_POSSTARI:
|
|
case OP_NOTPOSSTAR:
|
|
case OP_NOTPOSSTARI:
|
|
case OP_PLUS:
|
|
case OP_PLUSI:
|
|
case OP_NOTPLUS:
|
|
case OP_NOTPLUSI:
|
|
case OP_MINPLUS:
|
|
case OP_MINPLUSI:
|
|
case OP_NOTMINPLUS:
|
|
case OP_NOTMINPLUSI:
|
|
case OP_POSPLUS:
|
|
case OP_POSPLUSI:
|
|
case OP_NOTPOSPLUS:
|
|
case OP_NOTPOSPLUSI:
|
|
case OP_QUERY:
|
|
case OP_QUERYI:
|
|
case OP_NOTQUERY:
|
|
case OP_NOTQUERYI:
|
|
case OP_MINQUERY:
|
|
case OP_MINQUERYI:
|
|
case OP_NOTMINQUERY:
|
|
case OP_NOTMINQUERYI:
|
|
case OP_POSQUERY:
|
|
case OP_POSQUERYI:
|
|
case OP_NOTPOSQUERY:
|
|
case OP_NOTPOSQUERYI:
|
|
if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
|
|
break;
|
|
}
|
|
#else
|
|
(void)(utf); /* Keep compiler happy by referencing function argument */
|
|
#endif
|
|
}
|
|
}
|
|
}



/*************************************************
*  Scan compiled regex for recursion reference   *
*************************************************/

/* This little function scans through a compiled pattern until it finds an
|
|
instance of OP_RECURSE.
|
|
|
|
Arguments:
|
|
code points to start of expression
|
|
utf TRUE in UTF-8 / UTF-16 / UTF-32 mode
|
|
|
|
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
|
|
*/
|
|
|
|
static const pcre_uchar *
|
|
find_recurse(const pcre_uchar *code, BOOL utf)
|
|
{
|
|
for (;;)
|
|
{
|
|
register pcre_uchar c = *code;
|
|
if (c == OP_END) return NULL;
|
|
if (c == OP_RECURSE) return code;
|
|
|
|
/* XCLASS is used for classes that cannot be represented just by a bit
|
|
map. This includes negated single high-valued characters. The length in
|
|
the table is zero; the actual length is stored in the compiled code. */
|
|
|
|
if (c == OP_XCLASS) code += GET(code, 1);
|
|
|
|
/* Otherwise, we can get the item's length from the table, except that for
|
|
repeated character types, we have to test for \p and \P, which have an extra
|
|
two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
|
|
must add in its length. */
|
|
|
|
else
|
|
{
|
|
switch(c)
|
|
{
|
|
case OP_TYPESTAR:
|
|
case OP_TYPEMINSTAR:
|
|
case OP_TYPEPLUS:
|
|
case OP_TYPEMINPLUS:
|
|
case OP_TYPEQUERY:
|
|
case OP_TYPEMINQUERY:
|
|
case OP_TYPEPOSSTAR:
|
|
case OP_TYPEPOSPLUS:
|
|
case OP_TYPEPOSQUERY:
|
|
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
|
|
break;
|
|
|
|
case OP_TYPEPOSUPTO:
|
|
case OP_TYPEUPTO:
|
|
case OP_TYPEMINUPTO:
|
|
case OP_TYPEEXACT:
|
|
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
|
|
code += 2;
|
|
break;
|
|
|
|
case OP_MARK:
|
|
case OP_PRUNE_ARG:
|
|
case OP_SKIP_ARG:
|
|
case OP_THEN_ARG:
|
|
code += code[1];
|
|
break;
|
|
}
|
|
|
|
/* Add in the fixed length from the table */
|
|
|
|
code += PRIV(OP_lengths)[c];
|
|
|
|
/* In UTF-8 mode, opcodes that are followed by a character may be followed
|
|
by a multi-byte character. The length in the table is a minimum, so we have
|
|
to arrange to skip the extra bytes. */
|
|
|
|
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
|
|
if (utf) switch(c)
|
|
{
|
|
case OP_CHAR:
|
|
case OP_CHARI:
|
|
case OP_NOT:
|
|
case OP_NOTI:
|
|
case OP_EXACT:
|
|
case OP_EXACTI:
|
|
case OP_NOTEXACT:
|
|
case OP_NOTEXACTI:
|
|
case OP_UPTO:
|
|
case OP_UPTOI:
|
|
case OP_NOTUPTO:
|
|
case OP_NOTUPTOI:
|
|
case OP_MINUPTO:
|
|
case OP_MINUPTOI:
|
|
case OP_NOTMINUPTO:
|
|
case OP_NOTMINUPTOI:
|
|
case OP_POSUPTO:
|
|
case OP_POSUPTOI:
|
|
case OP_NOTPOSUPTO:
|
|
case OP_NOTPOSUPTOI:
|
|
case OP_STAR:
|
|
case OP_STARI:
|
|
case OP_NOTSTAR:
|
|
case OP_NOTSTARI:
|
|
case OP_MINSTAR:
|
|
case OP_MINSTARI:
|
|
case OP_NOTMINSTAR:
|
|
case OP_NOTMINSTARI:
|
|
case OP_POSSTAR:
|
|
case OP_POSSTARI:
|
|
case OP_NOTPOSSTAR:
|
|
case OP_NOTPOSSTARI:
|
|
case OP_PLUS:
|
|
case OP_PLUSI:
|
|
case OP_NOTPLUS:
|
|
case OP_NOTPLUSI:
|
|
case OP_MINPLUS:
|
|
case OP_MINPLUSI:
|
|
case OP_NOTMINPLUS:
|
|
case OP_NOTMINPLUSI:
|
|
case OP_POSPLUS:
|
|
case OP_POSPLUSI:
|
|
case OP_NOTPOSPLUS:
|
|
case OP_NOTPOSPLUSI:
|
|
case OP_QUERY:
|
|
case OP_QUERYI:
|
|
case OP_NOTQUERY:
|
|
case OP_NOTQUERYI:
|
|
case OP_MINQUERY:
|
|
case OP_MINQUERYI:
|
|
case OP_NOTMINQUERY:
|
|
case OP_NOTMINQUERYI:
|
|
case OP_POSQUERY:
|
|
case OP_POSQUERYI:
|
|
case OP_NOTPOSQUERY:
|
|
case OP_NOTPOSQUERYI:
|
|
if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
|
|
break;
|
|
}
|
|
#else
|
|
(void)(utf); /* Keep compiler happy by referencing function argument */
|
|
#endif
|
|
}
|
|
}
|
|
}



/*************************************************
*     Scan compiled branch for non-emptiness     *
*************************************************/

/* This function scans through a branch of a compiled pattern to see whether it
|
|
can match the empty string or not. It is called from could_be_empty()
|
|
below and from compile_branch() when checking for an unlimited repeat of a
|
|
group that can match nothing. Note that first_significant_code() skips over
|
|
backward and negative forward assertions when its final argument is TRUE. If we
|
|
hit an unclosed bracket, we return "empty" - this means we've struck an inner
|
|
bracket whose current branch will already have been scanned.
|
|
|
|
Arguments:
|
|
code points to start of search
|
|
endcode points to where to stop
|
|
utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
|
|
cd contains pointers to tables etc.
|
|
recurses chain of recurse_check to catch mutual recursion
|
|
|
|
Returns: TRUE if what is matched could be empty
|
|
*/
|
|
|
|
static BOOL
|
|
could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
|
|
BOOL utf, compile_data *cd, recurse_check *recurses)
|
|
{
|
|
register pcre_uchar c;
|
|
recurse_check this_recurse;
|
|
|
|
for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
|
|
code < endcode;
|
|
code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
|
|
{
|
|
const pcre_uchar *ccode;
|
|
|
|
c = *code;
|
|
|
|
/* Skip over forward assertions; the other assertions are skipped by
|
|
first_significant_code() with a TRUE final argument. */
|
|
|
|
if (c == OP_ASSERT)
|
|
{
|
|
do code += GET(code, 1); while (*code == OP_ALT);
|
|
c = *code;
|
|
continue;
|
|
}
|
|
|
|
/* For a recursion/subroutine call, if its end has been reached, which
|
|
implies a backward reference subroutine call, we can scan it. If it's a
|
|
forward reference subroutine call, we can't. To detect forward reference
|
|
we have to scan up the list that is kept in the workspace. This function is
|
|
called only when doing the real compile, not during the pre-compile that
|
|
measures the size of the compiled pattern. */
|
|
|
|
if (c == OP_RECURSE)
|
|
{
|
|
const pcre_uchar *scode = cd->start_code + GET(code, 1);
|
|
const pcre_uchar *endgroup = scode;
|
|
BOOL empty_branch;
|
|
|
|
/* Test for forward reference or uncompleted reference. This is disabled
|
|
when called to scan a completed pattern by setting cd->start_workspace to
|
|
NULL. */
|
|
|
|
if (cd->start_workspace != NULL)
|
|
{
|
|
const pcre_uchar *tcode;
|
|
for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
|
|
if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
|
|
if (GET(scode, 1) == 0) return TRUE; /* Unclosed */
|
|
}
|
|
|
|
/* If the reference is to a completed group, we need to detect whether this
|
|
is a recursive call, as otherwise there will be an infinite loop. If it is
|
|
a recursion, just skip over it. Simple recursions are easily detected. For
|
|
mutual recursions we keep a chain on the stack. */
|
|
|
|
do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
|
|
if (code >= scode && code <= endgroup) continue; /* Simple recursion */
|
|
else
|
|
{
|
|
recurse_check *r = recurses;
|
|
for (r = recurses; r != NULL; r = r->prev)
|
|
if (r->group == scode) break;
|
|
if (r != NULL) continue; /* Mutual recursion */
|
|
}
|
|
|
|
/* Completed reference; scan the referenced group, remembering it on the
|
|
stack chain to detect mutual recursions. */
|
|
|
|
empty_branch = FALSE;
|
|
this_recurse.prev = recurses;
|
|
this_recurse.group = scode;
|
|
|
|
do
|
|
{
|
|
if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
|
|
{
|
|
empty_branch = TRUE;
|
|
break;
|
|
}
|
|
scode += GET(scode, 1);
|
|
}
|
|
while (*scode == OP_ALT);
|
|
|
|
if (!empty_branch) return FALSE; /* All branches are non-empty */
|
|
continue;
|
|
}
|
|
|
|
/* Groups with zero repeats can of course be empty; skip them. */
|
|
|
|
if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
|
|
c == OP_BRAPOSZERO)
|
|
{
|
|
code += PRIV(OP_lengths)[c];
|
|
do code += GET(code, 1); while (*code == OP_ALT);
|
|
c = *code;
|
|
continue;
|
|
}
|
|
|
|
/* A nested group that is already marked as "could be empty" can just be
|
|
skipped. */
|
|
|
|
if (c == OP_SBRA || c == OP_SBRAPOS ||
|
|
c == OP_SCBRA || c == OP_SCBRAPOS)
|
|
{
|
|
do code += GET(code, 1); while (*code == OP_ALT);
|
|
c = *code;
|
|
continue;
|
|
}
|
|
|
|
/* For other groups, scan the branches. */
|
|
|
|
if (c == OP_BRA || c == OP_BRAPOS ||
|
|
c == OP_CBRA || c == OP_CBRAPOS ||
|
|
c == OP_ONCE || c == OP_ONCE_NC ||
|
|
c == OP_COND || c == OP_SCOND)
|
|
{
|
|
BOOL empty_branch;
|
|
if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
|
|
|
|
/* If a conditional group has only one branch, there is a second, implied,
|
|
empty branch, so just skip over the conditional, because it could be empty.
|
|
Otherwise, scan the individual branches of the group. */
|
|
|
|
if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
|
|
code += GET(code, 1);
|
|
else
|
|
{
|
|
empty_branch = FALSE;
|
|
do
|
|
{
|
|
if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
|
|
recurses)) empty_branch = TRUE;
|
|
code += GET(code, 1);
|
|
}
|
|
while (*code == OP_ALT);
|
|
if (!empty_branch) return FALSE; /* All branches are non-empty */
|
|
}
|
|
|
|
c = *code;
|
|
continue;
|
|
}
|
|
|
|
/* Handle the other opcodes */
|
|
|
|
switch (c)
|
|
{
|
|
/* Check for quantifiers after a class. XCLASS is used for classes that
|
|
cannot be represented just by a bit map. This includes negated single
|
|
high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
|
|
actual length is stored in the compiled code, so we must update "code"
|
|
here. */
|
|
|
|
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
|
|
case OP_XCLASS:
|
|
ccode = code += GET(code, 1);
|
|
goto CHECK_CLASS_REPEAT;
|
|
#endif
|
|
|
|
case OP_CLASS:
|
|
case OP_NCLASS:
|
|
ccode = code + PRIV(OP_lengths)[OP_CLASS];
|
|
|
|
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
|
|
CHECK_CLASS_REPEAT:
|
|
#endif
|
|
|
|
switch (*ccode)
|
|
{
|
|
case OP_CRSTAR: /* These could be empty; continue */
|
|
case OP_CRMINSTAR:
|
|
case OP_CRQUERY:
|
|
case OP_CRMINQUERY:
|
|
case OP_CRPOSSTAR:
|
|
case OP_CRPOSQUERY:
|
|
break;
|
|
|
|
default: /* Non-repeat => class must match */
|
|
case OP_CRPLUS: /* These repeats aren't empty */
|
|
case OP_CRMINPLUS:
|
|
case OP_CRPOSPLUS:
|
|
return FALSE;
|
|
|
|
case OP_CRRANGE:
|
|
case OP_CRMINRANGE:
|
|
case OP_CRPOSRANGE:
|
|
if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
|
|
break;
|
|
}
|
|
break;
|
|
|
|
/* Opcodes that must match a character */
|
|
|
|
case OP_ANY:
|
|
case OP_ALLANY:
|
|
case OP_ANYBYTE:
|
|
|
|
case OP_PROP:
|
|
case OP_NOTPROP:
|
|
case OP_ANYNL:
|
|
|
|
case OP_NOT_HSPACE:
|
|
case OP_HSPACE:
|
|
case OP_NOT_VSPACE:
|
|
case OP_VSPACE:
|
|
case OP_EXTUNI:
|
|
|
|
case OP_NOT_DIGIT:
|
|
case OP_DIGIT:
|
|
case OP_NOT_WHITESPACE:
|
|
case OP_WHITESPACE:
|
|
case OP_NOT_WORDCHAR:
|
|
case OP_WORDCHAR:
|
|
|
|
case OP_CHAR:
|
|
case OP_CHARI:
|
|
case OP_NOT:
|
|
case OP_NOTI:
|
|
|
|
case OP_PLUS:
|
|
case OP_PLUSI:
|
|
case OP_MINPLUS:
|
|
case OP_MINPLUSI:
|
|
|
|
case OP_NOTPLUS:
|
|
case OP_NOTPLUSI:
|
|
case OP_NOTMINPLUS:
|
|
case OP_NOTMINPLUSI:
|
|
|
|
case OP_POSPLUS:
|
|
case OP_POSPLUSI:
|
|
case OP_NOTPOSPLUS:
|
|
case OP_NOTPOSPLUSI:
|
|
|
|
case OP_EXACT:
|
|
case OP_EXACTI:
|
|
case OP_NOTEXACT:
|
|
case OP_NOTEXACTI:
|
|
|
|
case OP_TYPEPLUS:
|
|
case OP_TYPEMINPLUS:
|
|
case OP_TYPEPOSPLUS:
|
|
case OP_TYPEEXACT:
|
|
|
|
return FALSE;
|
|
|
|
/* These are going to continue, as they may be empty, but we have to
|
|
fudge the length for the \p and \P cases. */
|
|
|
|
case OP_TYPESTAR:
|
|
case OP_TYPEMINSTAR:
|
|
case OP_TYPEPOSSTAR:
|
|
case OP_TYPEQUERY:
|
|
case OP_TYPEMINQUERY:
|
|
case OP_TYPEPOSQUERY:
|
|
if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
|
|
break;
|
|
|
|
/* Same for these */
|
|
|
|
case OP_TYPEUPTO:
|
|
case OP_TYPEMINUPTO:
|
|
case OP_TYPEPOSUPTO:
|
|
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
|
|
code += 2;
|
|
break;
|
|
|
|
/* End of branch */
|
|
|
|
case OP_KET:
|
|
case OP_KETRMAX:
|
|
case OP_KETRMIN:
|
|
case OP_KETRPOS:
|
|
case OP_ALT:
|
|
return TRUE;
|
|
|
|
/* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
|
|
MINUPTO, and POSUPTO and their caseless and negative versions may be
|
|
followed by a multibyte character. */
|
|
|
|
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
|
|
case OP_STAR:
|
|
case OP_STARI:
|
|
case OP_NOTSTAR:
|
|
case OP_NOTSTARI:
|
|
|
|
case OP_MINSTAR:
|
|
case OP_MINSTARI:
|
|
case OP_NOTMINSTAR:
|
|
case OP_NOTMINSTARI:
|
|
|
|
case OP_POSSTAR:
|
|
case OP_POSSTARI:
|
|
case OP_NOTPOSSTAR:
|
|
case OP_NOTPOSSTARI:
|
|
|
|
case OP_QUERY:
|
|
case OP_QUERYI:
|
|
case OP_NOTQUERY:
|
|
case OP_NOTQUERYI:
|
|
|
|
case OP_MINQUERY:
|
|
case OP_MINQUERYI:
|
|
case OP_NOTMINQUERY:
|
|
case OP_NOTMINQUERYI:
|
|
|
|
case OP_POSQUERY:
|
|
case OP_POSQUERYI:
|
|
case OP_NOTPOSQUERY:
|
|
case OP_NOTPOSQUERYI:
|
|
|
|
if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
|
|
break;
|
|
|
|
case OP_UPTO:
|
|
case OP_UPTOI:
|
|
case OP_NOTUPTO:
|
|
case OP_NOTUPTOI:
|
|
|
|
case OP_MINUPTO:
|
|
case OP_MINUPTOI:
|
|
case OP_NOTMINUPTO:
|
|
case OP_NOTMINUPTOI:
|
|
|
|
case OP_POSUPTO:
|
|
case OP_POSUPTOI:
|
|
case OP_NOTPOSUPTO:
|
|
case OP_NOTPOSUPTOI:
|
|
|
|
if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
|
|
break;
|
|
#endif
|
|
|
|
/* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
|
|
string. */
|
|
|
|
case OP_MARK:
|
|
case OP_PRUNE_ARG:
|
|
case OP_SKIP_ARG:
|
|
case OP_THEN_ARG:
|
|
code += code[1];
|
|
break;
|
|
|
|
/* None of the remaining opcodes are required to match a character. */
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
return TRUE;
|
|
}



/*************************************************
*     Scan compiled regex for non-emptiness      *
*************************************************/

/* This function is called to check for left recursive calls. We want to check
|
|