summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
authorTSUYUSATO Kitsune <[email protected]>2022-10-20 16:52:23 +0900
committerYusuke Endoh <[email protected]>2022-11-09 23:21:26 +0900
commitf25bb291b42a45d23cfc8658720c62e1f3a7390f ()
tree2760425ac665c207dba7805424bd0d5073625b13 /regexec.c
parentea3d9893bf4d6c9b6016d5f7fe5a6cf820376e53 (diff)
Support OP_REPEAT and OP_REPEAT_INC
-rw-r--r--regexec.c218
1 files changed, 180 insertions, 38 deletions
@@ -235,12 +235,15 @@ onig_get_capture_tree(OnigRegion* region)
/* count number of jump-like opcodes for allocation of cache memory. */
/* return -1 if we cannot optimize the regex matching by using cache. */
-static int count_num_cache_opcode(regex_t* reg)
{
int num = 0;
UChar* p = reg->p;
UChar* pend = p + reg->used;
LengthType len;
OnigEncoding enc = reg->enc;
while (p < pend) {
@@ -295,10 +298,10 @@ static int count_num_cache_opcode(regex_t* reg)
break;
case OP_ANYCHAR_STAR:
case OP_ANYCHAR_ML_STAR:
- num++; break;
case OP_ANYCHAR_STAR_PEEK_NEXT:
case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
- p++; num++; break;
case OP_WORD:
case OP_NOT_WORD:
@@ -352,19 +355,54 @@ static int count_num_cache_opcode(regex_t* reg)
case OP_PUSH:
p += SIZE_RELADDR;
num++;
break;
case OP_POP:
break;
case OP_PUSH_OR_JUMP_EXACT1:
case OP_PUSH_IF_PEEK_NEXT:
- p += SIZE_RELADDR + 1; num++; break;
case OP_REPEAT:
case OP_REPEAT_NG:
case OP_REPEAT_INC:
case OP_REPEAT_INC_NG:
case OP_REPEAT_INC_SG:
case OP_REPEAT_INC_NG_SG:
- // TODO: support OP_REPEAT opcodes.
return NUM_CACHE_OPCODE_FAIL;
case OP_NULL_CHECK_START:
case OP_NULL_CHECK_END:
@@ -410,12 +448,16 @@ static int count_num_cache_opcode(regex_t* reg)
return num;
}
-static void init_cache_index_table(regex_t* reg, UChar **table)
{
UChar* pbegin;
UChar* p = reg->p;
UChar* pend = p + reg->used;
LengthType len;
OnigEncoding enc = reg->enc;
while (p < pend) {
@@ -470,11 +512,20 @@ static void init_cache_index_table(regex_t* reg, UChar **table)
break;
case OP_ANYCHAR_STAR:
case OP_ANYCHAR_ML_STAR:
- *table++ = pbegin; break;
case OP_ANYCHAR_STAR_PEEK_NEXT:
case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
p++;
- *table++ = pbegin;
break;
case OP_WORD:
@@ -528,17 +579,55 @@ static void init_cache_index_table(regex_t* reg, UChar **table)
break;
case OP_PUSH:
p += SIZE_RELADDR;
- *table++ = pbegin;
break;
case OP_POP:
break;
case OP_PUSH_OR_JUMP_EXACT1:
case OP_PUSH_IF_PEEK_NEXT:
- p += SIZE_RELADDR + 1; *table++ = pbegin; break;
case OP_REPEAT:
case OP_REPEAT_NG:
case OP_REPEAT_INC:
case OP_REPEAT_INC_NG:
case OP_REPEAT_INC_SG:
case OP_REPEAT_INC_NG_SG:
// TODO: support OP_REPEAT opcodes.
@@ -584,20 +673,6 @@ static void init_cache_index_table(regex_t* reg, UChar **table)
}
}
}
-
-static int find_cache_index_table(UChar** table, int num_cache_table, UChar* p)
-{
- int l = 0, r = num_cache_table - 1, m;
-
- while (l <= r) {
- m = (l + r) / 2;
- if (table[m] == p) return m;
- if (table[m] < p) l = m + 1;
- else r = m - 1;
- }
-
- return -1;
-}
#endif /* USE_MATCH_CACHE */
extern void
@@ -787,7 +862,7 @@ onig_region_copy(OnigRegion* to, const OnigRegion* from)
(msa).enable_cache_match_opt = 0;\
(msa).num_fail = 0;\
(msa).num_cache_opcode = NUM_CACHE_OPCODE_UNINIT;\
- (msa).cache_index_table = (UChar **)0;\
(msa).match_cache = (uint8_t *)0;\
} while(0)
#define MATCH_ARG_FREE_CACHE_MATCH_OPT(msa) do {\
@@ -1083,23 +1158,70 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
#ifdef USE_CACHE_MATCH_OPT
-#define DO_CACHE_MATCH_OPT(enable,p,num_cache_table,table,pos,match_cache) do {\
if (enable) {\
- int cache_index = find_cache_index_table((table), (num_cache_table), (p));\
if (cache_index >= 0) {\
- int key = (num_cache_table) * (int)(pos) + cache_index;\
int index = key >> 3;\
int mask = 1 << (key & 7);\
if ((match_cache)[index] & mask) {\
goto fail;\
}\
(match_cache)[index] |= mask;\
}\
}\
} while (0)
#else
-#define DO_CACHE_MATCH_OPT(enable,p,num_cache_table,table,pos,match_cache)
#endif /* USE_CACHE_MATCH_OPT */
#define STACK_PUSH_REPEAT(id, pat) do {\
@@ -2587,7 +2709,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
CASE(OP_ANYCHAR_STAR) MOP_IN(OP_ANYCHAR_STAR);
while (DATA_ENSURE_CHECK1) {
- DO_CACHE_MATCH_OPT(msa->enable_cache_match_opt, pbegin, msa->num_cache_opcode, msa->cache_index_table, end - s, msa->match_cache);
STACK_PUSH_ALT(p, s, sprev, pkeep);
n = enclen(encode, s, end);
DATA_ENSURE(n);
@@ -2600,7 +2722,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
CASE(OP_ANYCHAR_ML_STAR) MOP_IN(OP_ANYCHAR_ML_STAR);
while (DATA_ENSURE_CHECK1) {
- DO_CACHE_MATCH_OPT(msa->enable_cache_match_opt, pbegin, msa->num_cache_opcode, msa->cache_index_table, end - s, msa->match_cache);
STACK_PUSH_ALT(p, s, sprev, pkeep);
n = enclen(encode, s, end);
if (n > 1) {
@@ -2619,7 +2741,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
CASE(OP_ANYCHAR_STAR_PEEK_NEXT) MOP_IN(OP_ANYCHAR_STAR_PEEK_NEXT);
while (DATA_ENSURE_CHECK1) {
if (*p == *s) {
- DO_CACHE_MATCH_OPT(msa->enable_cache_match_opt, pbegin, msa->num_cache_opcode, msa->cache_index_table, end - s, msa->match_cache);
STACK_PUSH_ALT(p + 1, s, sprev, pkeep);
}
n = enclen(encode, s, end);
@@ -2635,7 +2757,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
CASE(OP_ANYCHAR_ML_STAR_PEEK_NEXT)MOP_IN(OP_ANYCHAR_ML_STAR_PEEK_NEXT);
while (DATA_ENSURE_CHECK1) {
if (*p == *s) {
- DO_CACHE_MATCH_OPT(msa->enable_cache_match_opt, pbegin, msa->num_cache_opcode, msa->cache_index_table, end - s, msa->match_cache);
STACK_PUSH_ALT(p + 1, s, sprev, pkeep);
}
n = enclen(encode, s, end);
@@ -3275,7 +3397,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
CASE(OP_PUSH) MOP_IN(OP_PUSH);
GET_RELADDR_INC(addr, p);
- DO_CACHE_MATCH_OPT(msa->enable_cache_match_opt, pbegin, msa->num_cache_opcode, msa->cache_index_table, end - s, msa->match_cache);
STACK_PUSH_ALT(p + addr, s, sprev, pkeep);
MOP_OUT;
JUMP;
@@ -3329,7 +3451,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
GET_RELADDR_INC(addr, p);
if (*p == *s && DATA_ENSURE_CHECK1) {
p++;
- DO_CACHE_MATCH_OPT(msa->enable_cache_match_opt, pbegin, msa->num_cache_opcode, msa->cache_index_table, end - s, msa->match_cache);
STACK_PUSH_ALT(p + addr, s, sprev, pkeep);
MOP_OUT;
JUMP;
@@ -3343,7 +3465,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
GET_RELADDR_INC(addr, p);
if (*p == *s) {
p++;
- DO_CACHE_MATCH_OPT(msa->enable_cache_match_opt, pbegin, msa->num_cache_opcode, msa->cache_index_table, end - s, msa->match_cache);
STACK_PUSH_ALT(p + addr, s, sprev, pkeep);
MOP_OUT;
JUMP;
@@ -3362,6 +3484,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
STACK_PUSH_REPEAT(mem, p);
if (reg->repeat_range[mem].lower == 0) {
STACK_PUSH_ALT(p + addr, s, sprev, pkeep);
}
}
@@ -3378,6 +3501,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
STACK_PUSH_REPEAT(mem, p);
if (reg->repeat_range[mem].lower == 0) {
STACK_PUSH_ALT(p, s, sprev, pkeep);
p += addr;
}
@@ -3396,6 +3520,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
/* end of repeat. Nothing to do. */
}
else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) {
STACK_PUSH_ALT(p, s, sprev, pkeep);
p = STACK_AT(si)->u.repeat.pcode; /* Don't use stkp after PUSH. */
}
@@ -3426,6 +3553,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
UChar* pcode = stkp->u.repeat.pcode;
STACK_PUSH_REPEAT_INC(si);
STACK_PUSH_ALT(pcode, s, sprev, pkeep);
}
else {
@@ -3615,22 +3745,24 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
#ifdef USE_CACHE_MATCH_OPT
if (++msa->num_fail >= (int)(end - str) + 1 && msa->num_cache_opcode == NUM_CACHE_OPCODE_UNINIT) {
msa->enable_cache_match_opt = 1;
if (msa->num_cache_opcode == NUM_CACHE_OPCODE_UNINIT) {
- msa->num_cache_opcode = count_num_cache_opcode(reg);
}
if (msa->num_cache_opcode == NUM_CACHE_OPCODE_FAIL || msa->num_cache_opcode == 0) {
msa->enable_cache_match_opt = 0;
goto fail_match_cache_opt;
}
if (msa->cache_index_table == NULL) {
- UChar **table = xmalloc(msa->num_cache_opcode * sizeof(UChar*));
if (table == NULL) {
msa->enable_cache_match_opt = 0;
goto fail_match_cache_opt;
}
init_cache_index_table(reg, table);
msa->cache_index_table = table;
}
// TODO: check arithemetic overflow.
int match_cache_size8 = msa->num_cache_opcode * ((int)(end - str) + 1);
@@ -3641,6 +3773,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
goto fail_match_cache_opt;
}
xmemset(msa->match_cache, 0, match_cache_size * sizeof(uint8_t));
}
fail_match_cache_opt:
#endif