Skip to content

JIT bug: Mismatch behavior with variable-length lookbehind containing capture group and backreference #911

@lijiang99

Description

@lijiang99

There is a discrepancy in matching results between the non-JIT interpreter and the JIT compiler when using a variable-length lookbehind that contains a capture group, followed by a backreference to that group.

  • Reproducible Example:
    • Subject: "BABA"
    • Pattern: "(?<*(.).{,9})(?=.*\1)"
    • Replacement: "|"
  • Steps to Reproduce:
    • Run the pattern against the subject string with substitution, comparing --jit and --no-jit. (Tested with try_sub16.cpp)
  • Actual Behavior (with JIT enabled):
    • Matches found: 2
    • Substitution result: "B|A|BA"
  • Expected Behavior (with JIT disabled):
    • Matches found: 3
    • Substitution result: "B|A|B|A"
  • Environment:
    • OS/Compiler: Windows/MSVC
#define PCRE2_CODE_UNIT_WIDTH 16
#include <pcre2.h>

#include <cstdio>
#include <cstring>
#include <cwchar>

// PCRE2 is used here with 16-bit code units (PCRE2_CODE_UNIT_WIDTH 16), and
// the wchar_t strings/buffers below are reinterpret_cast to PCRE2's code-unit
// pointer types, so wchar_t must be exactly two bytes wide for that to work.
static_assert(sizeof(wchar_t) == 2,
              "this sample assumes wchar_t is 16-bit (Windows/MSVC)");

// Writes the command-line synopsis of this reproducer to stdout.
// `prog` is the program name substituted into the first line of the text.
static void print_usage(const char *prog)
{
    static const char usage_format[] =
        "usage: %s [--jit | --no-jit]\n"
        "  --jit, -j      enable JIT compilation (default)\n"
        "  --no-jit, -n   disable JIT, use interpreter\n"
        "  --help, -h     show this help\n";

    std::printf(usage_format, prog);
}

// Looks up PCRE2's message for `code`, stores it in `buf`, and returns `buf`.
// The buffer is terminated up front and a generic text is filled in when the
// lookup reports failure without having produced any message, so the result
// is always a valid zero-terminated string that can be printed with %ls.
static const wchar_t *describe_pcre2_error(int code, wchar_t (&buf)[256])
{
    buf[0] = L'\0';
    const int msg_rc = pcre2_get_error_message(
        code, reinterpret_cast<PCRE2_UCHAR *>(buf), 256);
    if (msg_rc < 0 && buf[0] == L'\0') {
        std::swprintf(buf, 256, L"unknown PCRE2 error %d", code);
    }
    return buf;
}

// Reproducer entry point: compiles a fixed pattern, optionally JIT-compiles
// it, runs a global substitution on the fixed subject "BABA", and prints the
// match count and resulting string.
//
// Options: --jit/-j (default), --no-jit/-n, --help/-h.
// Exit status: 0 on success or after --help, 64 for an unknown option,
// 1 if the pattern fails to compile, 3 if the substitution fails.
int main(int argc, char **argv)
{
    bool use_jit = true;

    for (int i = 1; i < argc; ++i) {
        const char *a = argv[i];
        if (std::strcmp(a, "--jit") == 0 || std::strcmp(a, "-j") == 0) {
            use_jit = true;
        } else if (std::strcmp(a, "--no-jit") == 0 || std::strcmp(a, "-n") == 0) {
            use_jit = false;
        } else if (std::strcmp(a, "--help") == 0 || std::strcmp(a, "-h") == 0) {
            print_usage(argv[0]);
            return 0;
        } else {
            std::fprintf(stderr, "unknown option: %s\n", a);
            print_usage(argv[0]);
            return 64;
        }
    }

    const wchar_t *subject     = L"BABA";
    const wchar_t *pattern     = L"(?<*(.).{,9})(?=.*\\1)";
    const wchar_t *replacement = L"|";

    PCRE2_SIZE subject_len     = std::wcslen(subject);
    PCRE2_SIZE replacement_len = std::wcslen(replacement);

    int        errornumber = 0;
    PCRE2_SIZE erroroffset = 0;

    pcre2_code *re = pcre2_compile(
        reinterpret_cast<PCRE2_SPTR>(pattern),
        PCRE2_ZERO_TERMINATED,
        0,
        &errornumber,
        &erroroffset,
        nullptr);

    if (re == nullptr) {
        wchar_t buf[256];
        std::fwprintf(stderr, L"compile failed at offset %zu: %ls\n",
                      static_cast<size_t>(erroroffset),
                      describe_pcre2_error(errornumber, buf));
        return 1;
    }

    // A JIT compile failure is reported but is not fatal: the substitution
    // below still runs, and the printed JIT state records the failure.
    bool jit_ready = false;
    if (use_jit) {
        int jit_rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
        if (jit_rc < 0) {
            wchar_t buf[256];
            std::fwprintf(stderr,
                          L"JIT compile failed: %ls (will fall back to interpreter)\n",
                          describe_pcre2_error(jit_rc, buf));
        } else {
            jit_ready = true;
        }
    }

    // In --no-jit mode no JIT code was compiled above; PCRE2_NO_JIT is passed
    // as well so the match is explicitly requested to run without JIT.
    uint32_t sub_options = PCRE2_SUBSTITUTE_GLOBAL;
    if (!use_jit) sub_options |= PCRE2_NO_JIT;

    // outlen is in/out: buffer capacity in code units on entry, and it is read
    // back as the result length after the call.
    wchar_t    outbuf[64];
    PCRE2_SIZE outlen = sizeof(outbuf) / sizeof(outbuf[0]);

    int rc = pcre2_substitute(
        re,
        reinterpret_cast<PCRE2_SPTR>(subject),
        subject_len,
        0,
        sub_options,
        nullptr,
        nullptr,
        reinterpret_cast<PCRE2_SPTR>(replacement),
        replacement_len,
        reinterpret_cast<PCRE2_UCHAR *>(outbuf),
        &outlen);

    if (rc < 0) {
        wchar_t buf[256];
        std::fwprintf(stderr, L"substitute failed: %ls\n",
                      describe_pcre2_error(rc, buf));
        pcre2_code_free(re);
        return 3;
    }

    // Terminate at the reported length before printing the buffer with %ls.
    outbuf[outlen] = 0;
    const char *jit_state = use_jit
        ? (jit_ready ? "enabled" : "requested-but-failed")
        : "disabled";

    // A non-negative rc is printed as the number of matches/substitutions.
    std::wprintf(L"subject     : \"%ls\"\n", subject);
    std::wprintf(L"pattern     : %ls\n",     pattern);
    std::wprintf(L"replacement : \"%ls\"\n", replacement);
    std::wprintf(L"JIT         : %hs\n",     jit_state);
    std::wprintf(L"matches     : %d\n",      rc);
    std::wprintf(L"result      : \"%ls\" (%zu code units)\n",
                 outbuf, static_cast<size_t>(outlen));

    pcre2_code_free(re);
    return 0;
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions