The non-JIT interpreter and the JIT compiler produce different matching results
when a pattern uses a variable-length lookbehind that contains a capture group, followed by a backreference to that group.
- Reproducible Example:
- Subject: "BABA"
- Pattern: "(?<*(.).{,9})(?=.*\1)"
- Replacement: "|"
- Steps to Reproduce:
- Run the pattern against the subject string with substitution, comparing --jit and --no-jit. (Tested with try_sub16.cpp.)
- Actual Behavior (with JIT enabled):
- Matches found: 2
- Substitution result: "B|A|BA"
- Expected Behavior (with JIT disabled):
- Matches found: 3
- Substitution result: "B|A|B|A"
- Environment:
- OS/Compiler: Windows/MSVC
// PCRE2_CODE_UNIT_WIDTH is defined before <pcre2.h> is included. The code in
// this file hands wchar_t buffers to PCRE2 through reinterpret_cast to
// PCRE2_SPTR / PCRE2_UCHAR*, so it is written around 16-bit code units.
#define PCRE2_CODE_UNIT_WIDTH 16
#include <pcre2.h>
#include <cstdio>
#include <cstring>
#include <cwchar>
// Compile-time guard for those casts: refuse to build where wchar_t is not
// two bytes wide, since the wchar_t literals would then not line up with the
// 16-bit code units selected above.
static_assert(sizeof(wchar_t) == 2,
"this sample assumes wchar_t is 16-bit (Windows/MSVC)");
/// Writes the command-line help text to stdout.
///
/// @param prog  Program name substituted into the "usage:" line
///              (the caller passes argv[0]).
static void print_usage(const char *prog)
{
    // Only the first line needs formatting; the option table is constant text.
    std::printf("usage: %s [--jit | --no-jit]\n", prog);
    std::fputs(
        " --jit, -j enable JIT compilation (default)\n"
        " --no-jit, -n disable JIT, use interpreter\n"
        " --help, -h show this help\n",
        stdout);
}
int main(int argc, char **argv)
{
bool use_jit = true;
for (int i = 1; i < argc; ++i) {
const char *a = argv[i];
if (std::strcmp(a, "--jit") == 0 || std::strcmp(a, "-j") == 0) {
use_jit = true;
} else if (std::strcmp(a, "--no-jit") == 0 || std::strcmp(a, "-n") == 0) {
use_jit = false;
} else if (std::strcmp(a, "--help") == 0 || std::strcmp(a, "-h") == 0) {
print_usage(argv[0]);
return 0;
} else {
std::fprintf(stderr, "unknown option: %s\n", a);
print_usage(argv[0]);
return 64;
}
}
const wchar_t *subject = L"BABA";
const wchar_t *pattern = L"(?<*(.).{,9})(?=.*\\1)";
const wchar_t *replacement = L"|";
PCRE2_SIZE subject_len = std::wcslen(subject);
PCRE2_SIZE replacement_len = std::wcslen(replacement);
int errornumber = 0;
PCRE2_SIZE erroroffset = 0;
pcre2_code *re = pcre2_compile(
reinterpret_cast<PCRE2_SPTR>(pattern),
PCRE2_ZERO_TERMINATED,
0,
&errornumber,
&erroroffset,
nullptr);
if (re == nullptr) {
wchar_t buf[256];
pcre2_get_error_message(errornumber,
reinterpret_cast<PCRE2_UCHAR *>(buf), 256);
std::fwprintf(stderr, L"compile failed at offset %zu: %ls\n",
(size_t)erroroffset, buf);
return 1;
}
bool jit_ready = false;
if (use_jit) {
int jit_rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
if (jit_rc < 0) {
wchar_t buf[256];
pcre2_get_error_message(jit_rc,
reinterpret_cast<PCRE2_UCHAR *>(buf), 256);
std::fwprintf(stderr,
L"JIT compile failed: %ls (will fall back to interpreter)\n",
buf);
} else {
jit_ready = true;
}
}
uint32_t sub_options = PCRE2_SUBSTITUTE_GLOBAL;
if (!use_jit) sub_options |= PCRE2_NO_JIT;
wchar_t outbuf[64];
PCRE2_SIZE outlen = sizeof(outbuf) / sizeof(outbuf[0]);
int rc = pcre2_substitute(
re,
reinterpret_cast<PCRE2_SPTR>(subject),
subject_len,
0,
sub_options,
nullptr,
nullptr,
reinterpret_cast<PCRE2_SPTR>(replacement),
replacement_len,
reinterpret_cast<PCRE2_UCHAR *>(outbuf),
&outlen);
if (rc < 0) {
wchar_t buf[256];
pcre2_get_error_message(rc,
reinterpret_cast<PCRE2_UCHAR *>(buf), 256);
std::fwprintf(stderr, L"substitute failed: %ls\n", buf);
pcre2_code_free(re);
return 3;
}
outbuf[outlen] = 0;
const char *jit_state = use_jit
? (jit_ready ? "enabled" : "requested-but-failed")
: "disabled";
std::wprintf(L"subject : \"%ls\"\n", subject);
std::wprintf(L"pattern : %ls\n", pattern);
std::wprintf(L"replacement : \"%ls\"\n", replacement);
std::wprintf(L"JIT : %hs\n", jit_state);
std::wprintf(L"matches : %d\n", rc);
std::wprintf(L"result : \"%ls\" (%zu code units)\n",
outbuf, (size_t)outlen);
pcre2_code_free(re);
return 0;
}
The non-JIT interpreter and the JIT compiler produce different matching results
when a pattern uses a variable-length lookbehind that contains a capture group, followed by a backreference to that group.