diff --git a/re2/compile.cc b/re2/compile.cc
index 95c1b32d..1c3806f0 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -1077,6 +1077,54 @@ static bool IsAnchorEnd(Regexp** pre, int depth) {
   return false;
 }
 
+// Detects whether the simplified regexp ends with a trailing match-all
+// (i.e. .* where . matches any character). Optionally followed by
+// end-of-line/end-of-text anchors, and optionally wrapped in a
+// capture group. Sets has_trailing_match_all and trailing_match_all_cap
+// on prog. Only applicable for the forward (non-reversed) program.
+static void DetectTrailingMatchAll(Regexp* re, Prog* prog) {
+  // Walk through Concat to find the last meaningful child.
+  // Skip trailing end-of-line / end-of-text anchors since they
+  // are always satisfied after a greedy .* that matches all chars.
+  Regexp* last = re;
+  if (last->op() == kRegexpConcat && last->nsub() > 0) {
+    int idx = last->nsub() - 1;
+    // Skip trailing anchors ($ or \z), and empty matches left behind
+    // by IsAnchorEnd stripping \z -- they are always satisfied
+    // after .* that matches any character.
+    while (idx > 0 &&
+           (last->sub()[idx]->op() == kRegexpEndLine ||
+            last->sub()[idx]->op() == kRegexpEndText ||
+            last->sub()[idx]->op() == kRegexpEmptyMatch))
+      idx--;
+    last = last->sub()[idx];
+  }
+
+  // Check for optional capture wrapping.
+  int cap = -1;
+  if (last->op() == kRegexpCapture) {
+    cap = last->cap();
+    last = last->sub()[0];
+    // Look through another Concat if present (e.g. the capture might
+    // contain a concat with .* as the last element, but typically
+    // it's just .* directly).
+    if (last->op() == kRegexpConcat && last->nsub() > 0)
+      last = last->sub()[last->nsub() - 1];
+  }
+
+  // Now last should be a greedy kRegexpStar of a match-all in OneLine mode.
+  // Non-greedy .* doesn't necessarily consume all remaining text.
+  // Without OneLine, ^ and $ are line-oriented and the optimization
+  // may produce incorrect results for multiline patterns.
+  if (last->op() == kRegexpStar && last->nsub() == 1 &&
+      (last->parse_flags() & Regexp::OneLine) &&
+      !(last->parse_flags() & Regexp::NonGreedy) &&
+      last->sub()[0]->op() == kRegexpAnyChar) {
+    prog->set_has_trailing_match_all(true);
+    prog->set_trailing_match_all_cap(cap);
+  }
+}
+
 void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
                      RE2::Anchor anchor) {
   if (flags & Regexp::Latin1)
@@ -1128,6 +1176,13 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
   bool is_anchor_start = IsAnchorStart(&sre, 0);
   bool is_anchor_end = IsAnchorEnd(&sre, 0);
 
+  // Detect trailing match-all (e.g. (?s:.*)$) before compilation.
+  // Only for the forward program. The trailing .* matches all
+  // remaining text regardless of whether there's an explicit end
+  // anchor, because it's a greedy match of all characters.
+  if (!reversed)
+    DetectTrailingMatchAll(sre, c.prog_);
+
   // Generate fragment for entire regexp.
   Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
   sre->Decref();
diff --git a/re2/prog.cc b/re2/prog.cc
index c60d930e..a5616273 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -117,6 +117,8 @@ Prog::Prog()
     reversed_(false),
     did_flatten_(false),
     did_onepass_(false),
+    has_trailing_match_all_(false),
+    trailing_match_all_cap_(-1),
     start_(0),
     start_unanchored_(0),
     size_(0),
diff --git a/re2/prog.h b/re2/prog.h
index 801f8b99..81c98210 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -241,6 +241,10 @@ class Prog {
   void set_anchor_start(bool b) { anchor_start_ = b; }
   bool anchor_end() { return anchor_end_; }
   void set_anchor_end(bool b) { anchor_end_ = b; }
+  bool has_trailing_match_all() { return has_trailing_match_all_; }
+  void set_has_trailing_match_all(bool b) { has_trailing_match_all_ = b; }
+  int trailing_match_all_cap() { return trailing_match_all_cap_; }
+  void set_trailing_match_all_cap(int cap) { trailing_match_all_cap_ = cap; }
   int bytemap_range() { return bytemap_range_; }
   const uint8_t* bytemap() { return bytemap_; }
   bool can_prefix_accel() { return prefix_size_ != 0; }
@@ -439,6 +443,8 @@ class Prog {
   bool reversed_;           // whether program runs backward over input
   bool did_flatten_;        // has Flatten been called?
   bool did_onepass_;        // has IsOnePass been called?
+  bool has_trailing_match_all_;  // pattern ends with .* (any-char star)
+  int trailing_match_all_cap_;   // capture index wrapping trailing .*, or -1
 
   int start_;               // entry point for program
   int start_unanchored_;    // unanchored entry point for program
diff --git a/re2/re2.cc b/re2/re2.cc
index 2e00d228..f4c2840b 100644
--- a/re2/re2.cc
+++ b/re2/re2.cc
@@ -45,6 +45,78 @@ namespace re2 {
 // Controls the maximum count permitted by GlobalReplace(); -1 is unlimited.
 static int maximum_global_replace_count = -1;
 
+// Strips a trailing greedy match-all (.* where . is kRegexpAnyChar)
+// from the Regexp tree *pre. Looks through trailing anchors ($, \z),
+// empty matches, and optional capture groups. Returns true if stripping
+// occurred. Only strips greedy stars in OneLine mode, consistent with
+// DetectTrailingMatchAll in compile.cc.
+// Caller owns the resulting *pre (which has been Incref'd as needed).
+static bool StripTrailingMatchAll(Regexp** pre) {
+  Regexp* re = *pre;
+  if (re == NULL)
+    return false;
+  switch (re->op()) {
+    default:
+      break;
+    case kRegexpConcat:
+      if (re->nsub() > 0) {
+        // Find last non-anchor/non-empty child.
+        int last = re->nsub() - 1;
+        while (last > 0 &&
+               (re->sub()[last]->op() == kRegexpEndLine ||
+                re->sub()[last]->op() == kRegexpEndText ||
+                re->sub()[last]->op() == kRegexpEmptyMatch))
+          last--;
+        Regexp* sub = re->sub()[last]->Incref();
+        if (StripTrailingMatchAll(&sub)) {
+          // Rebuild concat: children 0..last-1, replace last with sub
+          // (skip sub if it's just an empty placeholder).
+          std::vector<Regexp*> subcopy;
+          for (int i = 0; i < last; i++)
+            subcopy.push_back(re->sub()[i]->Incref());
+          if (sub->op() != kRegexpEmptyMatch)
+            subcopy.push_back(sub);
+          else
+            sub->Decref();
+          if (subcopy.empty()) {
+            *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
+          } else {
+            *pre = Regexp::Concat(subcopy.data(),
+                                  static_cast<int>(subcopy.size()),
+                                  re->parse_flags());
+          }
+          re->Decref();
+          return true;
+        }
+        sub->Decref();
+      }
+      break;
+    case kRegexpCapture: {
+      Regexp* sub = re->sub()[0]->Incref();
+      if (StripTrailingMatchAll(&sub)) {
+        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
+        re->Decref();
+        return true;
+      }
+      sub->Decref();
+      break;
+    }
+    case kRegexpStar: {
+      // Only strip greedy stars in OneLine mode — must match the
+      // conditions in DetectTrailingMatchAll (compile.cc).
+      if ((re->parse_flags() & Regexp::OneLine) &&
+          !(re->parse_flags() & Regexp::NonGreedy) &&
+          re->sub()[0]->op() == kRegexpAnyChar) {
+        *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
+        re->Decref();
+        return true;
+      }
+      break;
+    }
+  }
+  return false;
+}
+
 void RE2::FUZZING_ONLY_set_maximum_global_replace_count(int i) {
   maximum_global_replace_count = i;
 }
@@ -227,6 +299,8 @@ void RE2::Init(absl::string_view pattern, const Options& options) {
   prefix_foldcase_ = false;
   prefix_.clear();
   prog_ = NULL;
+  prefix_match_ = NULL;
+  trailing_match_all_cap_ = -1;
 
   rprog_ = NULL;
   named_groups_ = NULL;
@@ -281,6 +355,32 @@ void RE2::Init(absl::string_view pattern, const Options& options) {
   // and that is harder to do if the DFA has already
   // been built.
   is_one_pass_ = prog_->IsOnePass();
+
+  // If the pattern ends with a trailing match-all (e.g. (?s:.*)$),
+  // build a prefix RE2 with the .* stripped for fast matching.
+  // Detection ran on the compiler's simplified tree; stripping works
+  // on entire_regexp_ and re-checks the same conditions, so a tree it
+  // cannot strip simply leaves the fast path disabled.
+  // Not done for longest_match: matched on its own, the prefix can pick
+  // different submatch boundaries than the full pattern would,
+  // e.g. (a|ab)(.*) on "abc".
+  if (prog_->has_trailing_match_all() && !options_.longest_match()) {
+    re2::Regexp* stripped = entire_regexp_->Incref();
+    if (StripTrailingMatchAll(&stripped)) {
+      // The stripped pattern is synthesized here, so a failure to
+      // re-parse it must not be logged as if the caller had written it.
+      Options quiet_options = options_;
+      quiet_options.set_log_errors(false);
+      RE2* prefix = new RE2(stripped->ToString(), quiet_options);
+      if (prefix->ok()) {
+        prefix_match_ = prefix;
+        trailing_match_all_cap_ = prog_->trailing_match_all_cap();
+      } else {
+        delete prefix;
+      }
+    }
+    stripped->Decref();
+  }
 }
 
 // Returns rprog_, computing it if needed.
@@ -303,6 +403,7 @@ re2::Prog* RE2::ReverseProg() const {
 }
 
 RE2::~RE2() {
+  delete prefix_match_;
   if (group_names_ != empty_group_names())
     delete group_names_;
   if (named_groups_ != empty_named_groups())
@@ -676,6 +777,45 @@ bool RE2::Match(absl::string_view text,
     return false;
   }
 
+  // Fast path: if we have a prefix RE2 (trailing match-all was stripped),
+  // delegate to it and extend the results to cover all remaining text.
+  // Only taken when the search window ends at end-of-text: the stripped
+  // pattern also lost its trailing anchors ($, \z), and with text beyond
+  // endpos those are not guaranteed to hold just because .* reached endpos.
+  // NOTE(review): in UTF-8 mode the original .* presumably stops at bytes
+  // that are not valid UTF-8, while this path extends over them -- confirm
+  // that is acceptable or restrict the fast path to valid input.
+  if (prefix_match_ != NULL && prefix_match_->ok() &&
+      endpos == text.size()) {
+    // The prefix RE2 doesn't have the trailing .*, which would have
+    // consumed everything up to endpos, so an end anchor requested by
+    // the caller must not be applied to the prefix: ANCHOR_BOTH becomes
+    // ANCHOR_START and ANCHOR_END becomes UNANCHORED.
+    Anchor prefix_anchor = re_anchor;
+    if (prefix_anchor == ANCHOR_BOTH)
+      prefix_anchor = ANCHOR_START;
+    else if (prefix_anchor == ANCHOR_END)
+      prefix_anchor = UNANCHORED;
+    if (!prefix_match_->Match(text, startpos, endpos, prefix_anchor,
+                              submatch, nsubmatch))
+      return false;
+    if (nsubmatch > 0) {
+      // Extend overall match to endpos.
+      submatch[0] = absl::string_view(
+          submatch[0].data(),
+          static_cast<size_t>(text.data() + endpos - submatch[0].data()));
+      // Extend trailing capture group if present.
+      int cap = trailing_match_all_cap_;
+      if (cap >= 0 && cap < nsubmatch &&
+          submatch[cap].data() != nullptr)
+        submatch[cap] = absl::string_view(
+            submatch[cap].data(),
+            static_cast<size_t>(text.data() + endpos -
+                                submatch[cap].data()));
+    }
+    return true;
+  }
+
   absl::string_view subtext = text;
   subtext.remove_prefix(startpos);
   subtext.remove_suffix(text.size() - endpos);
diff --git a/re2/re2.h b/re2/re2.h
index eed332d0..d897cef2 100644
--- a/re2/re2.h
+++ b/re2/re2.h
@@ -798,6 +798,12 @@ class RE2 {
   std::string prefix_;          // required prefix (before suffix_regexp_)
   re2::Prog* prog_;             // compiled program for regexp
 
+  // If the pattern ends with a trailing match-all (e.g. (?s:.*)$),
+  // this is a compiled RE2 for the prefix (with .* stripped).
+  // Match delegates to this and extends the results to end-of-text.
+  RE2* prefix_match_;
+  int trailing_match_all_cap_;  // capture index wrapping .*, or -1
+
   // Reverse Prog for DFA execution only
   mutable re2::Prog* rprog_;
   // Map from capture names to indices
diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc
index 5c1822d7..d9c4ed82 100644
--- a/re2/testing/re2_test.cc
+++ b/re2/testing/re2_test.cc
@@ -1696,4 +1696,163 @@ TEST(RE2, InitNULL) {
   ASSERT_TRUE(RE2::FullMatch("", NULL));
 }
 
+TEST(RE2, TrailingMatchAll) {
+  // Test that patterns ending with .* (when dot matches newline)
+  // produce correct results via the trailing match-all fast path.
+  RE2::Options opt;
+  opt.set_dot_nl(true);
+
+  // Basic: FullMatch with trailing .* (no capture on the tail).
+  {
+    RE2 re("^https?://(?:www\\.)?([^/]+)/.*$", opt);
+    absl::string_view host;
+    ASSERT_TRUE(RE2::FullMatch("https://example.com/foo/bar", re, &host));
+    EXPECT_EQ(host, "example.com");
+  }
+
+  // Large trailing text should produce identical results.
+  {
+    RE2 re("^prefix:([a-z]+)(.*)$", opt);
+    std::string input = "prefix:hello";
+    input.append(100000, 'X');
+    absl::string_view word, rest;
+    ASSERT_TRUE(RE2::FullMatch(input, re, &word, &rest));
+    EXPECT_EQ(word, "hello");
+    EXPECT_EQ(rest.size(), 100000u);
+    EXPECT_EQ(rest.data(), input.data() + 12);
+  }
+
+  // Trailing .* without capture group.
+  {
+    RE2 re("^([a-z]+).*$", opt);
+    absl::string_view word;
+    ASSERT_TRUE(RE2::FullMatch("helloWORLD", re, &word));
+    EXPECT_EQ(word, "hello");
+  }
+
+  // Replace with trailing .* capture.
+  {
+    RE2 re("^([^:]+):(.*)$", opt);
+    std::string s = "key:value_with_long_suffix";
+    ASSERT_TRUE(RE2::Replace(&s, re, "\\1=\\2"));
+    EXPECT_EQ(s, "key=value_with_long_suffix");
+  }
+
+  // GlobalReplace with trailing .*.
+  {
+    RE2 re("^([^:]+):(.*)$", opt);
+    std::string s = "key:value";
+    ASSERT_EQ(RE2::GlobalReplace(&s, re, "\\1"), 1);
+    EXPECT_EQ(s, "key");
+  }
+
+  // Consume with trailing .*.
+  {
+    RE2 re("([a-z]+).*", opt);
+    absl::string_view input("helloWORLD");
+    absl::string_view word;
+    ASSERT_TRUE(RE2::Consume(&input, re, &word));
+    EXPECT_EQ(word, "hello");
+    EXPECT_TRUE(input.empty());
+  }
+
+  // No match should still fail correctly.
+  {
+    RE2 re("^NOMATCH.*$", opt);
+    ASSERT_FALSE(RE2::FullMatch("hello", re));
+  }
+
+  // Without dot_nl, .* is [^\n]* -- no fast path, but still correct.
+  {
+    RE2 re_no_s("^([a-z]+).*$");
+    absl::string_view word;
+    ASSERT_TRUE(RE2::FullMatch("helloWORLD", re_no_s, &word));
+    EXPECT_EQ(word, "hello");
+    // With embedded newline, should not match ($ requires end-of-text).
+    ASSERT_FALSE(RE2::FullMatch("hello\nworld", re_no_s, &word));
+  }
+
+  // PartialMatch with trailing .*.
+  {
+    RE2 re("([0-9]+).*", opt);
+    absl::string_view num;
+    ASSERT_TRUE(RE2::PartialMatch("abc123xyz", re, &num));
+    EXPECT_EQ(num, "123");
+  }
+
+  // nsubmatch == 0: just checking match, no captures.
+  {
+    RE2 re("^https?://.*$", opt);
+    ASSERT_TRUE(RE2::FullMatch("https://example.com/foo", re));
+    ASSERT_FALSE(RE2::FullMatch("not a url", re));
+  }
+
+  // Explicit Match() with ANCHOR_START.
+  {
+    RE2 re("([a-z]+).*", opt);
+    absl::string_view submatch[2];
+    ASSERT_TRUE(re.Match("helloWORLD", 0, 10, RE2::ANCHOR_START,
+                         submatch, 2));
+    EXPECT_EQ(submatch[0], "helloWORLD");
+    EXPECT_EQ(submatch[1], "hello");
+  }
+
+  // Explicit Match() with ANCHOR_END: the trailing .* reaches end-of-text
+  // on its own, so the prefix must not be required to end there.
+  {
+    RE2 re("([a-z]+).*", opt);
+    absl::string_view submatch[2];
+    ASSERT_TRUE(re.Match("XXhelloWORLD", 0, 12, RE2::ANCHOR_END,
+                         submatch, 2));
+    EXPECT_EQ(submatch[0], "helloWORLD");
+    EXPECT_EQ(submatch[1], "hello");
+  }
+
+  // Sub-range matching via Match().
+  {
+    RE2 re("([a-z]+).*", opt);
+    absl::string_view submatch[2];
+    ASSERT_TRUE(re.Match("XXhelloWORLDXX", 2, 12, RE2::UNANCHORED,
+                         submatch, 2));
+    EXPECT_EQ(submatch[0], "helloWORLD");
+    EXPECT_EQ(submatch[1], "hello");
+  }
+
+  // Nested capture groups around .*.
+  {
+    RE2 re("^([a-z]+)((.*))$", opt);
+    absl::string_view word, outer, inner;
+    ASSERT_TRUE(RE2::FullMatch("helloWORLD", re, &word, &outer, &inner));
+    EXPECT_EQ(word, "hello");
+    EXPECT_EQ(outer, "WORLD");
+    EXPECT_EQ(inner, "WORLD");
+  }
+
+  // Empty text matching .*
+  {
+    RE2 re("^.*$", opt);
+    ASSERT_TRUE(RE2::FullMatch("", re));
+  }
+
+  // FindAndConsume with trailing .*.
+  {
+    RE2 re("([a-z]+).*", opt);
+    absl::string_view input("helloWORLD");
+    absl::string_view word;
+    ASSERT_TRUE(RE2::FindAndConsume(&input, re, &word));
+    EXPECT_EQ(word, "hello");
+    EXPECT_TRUE(input.empty());
+  }
+
+  // Pattern with .* in the middle — should NOT trigger optimization.
+  // Greedy .* matches as much as possible, leaving only "3" for [0-9]+.
+  {
+    RE2 re("^(.*)([0-9]+)$", opt);
+    absl::string_view prefix, num;
+    ASSERT_TRUE(RE2::FullMatch("hello123", re, &prefix, &num));
+    EXPECT_EQ(prefix, "hello12");
+    EXPECT_EQ(num, "3");
+  }
+}
+
 }  // namespace re2
diff --git a/re2/testing/regexp_benchmark.cc b/re2/testing/regexp_benchmark.cc
index 3940467d..6211890f 100644
--- a/re2/testing/regexp_benchmark.cc
+++ b/re2/testing/regexp_benchmark.cc
@@ -1602,4 +1602,59 @@ BENCHMARK(PossibleMatchRange_Complex);
 BENCHMARK(PossibleMatchRange_Prefix);
 BENCHMARK(PossibleMatchRange_NoProg);
 
+// Benchmarks for trailing match-all optimization.
+// Tests matching a URL pattern against a URL with a large suffix.
+
+void URLMatchRE2(benchmark::State& state) {
+  // Build a URL string: a short prefix followed by a large suffix.
+  std::string url = "https://www.example.com/";
+  url.append(state.range(0), 'x');
+
+  RE2::Options opt;
+  opt.set_dot_nl(true);
+  RE2 re(R"(^https?://(?:www\.)?([^/]+)/.*$)", opt);
+  ABSL_CHECK(re.ok());
+  for (auto _ : state) {
+    absl::string_view host;
+    ABSL_CHECK(RE2::FullMatch(url, re, &host));
+  }
+}
+
+BENCHMARK(URLMatchRE2)->Arg(1 << 10)->Arg(1 << 16)->Arg(1 << 20);
+
+void URLExtractRE2(benchmark::State& state) {
+  std::string url = "https://www.example.com/";
+  url.append(state.range(0), 'x');
+
+  RE2::Options opt;
+  opt.set_dot_nl(true);
+  RE2 re(R"(^https?://(?:www\.)?([^/]+)/.*$)", opt);
+  ABSL_CHECK(re.ok());
+  for (auto _ : state) {
+    std::string host;
+    ABSL_CHECK(RE2::Extract(url, re, R"(\1)", &host));
+  }
+}
+
+BENCHMARK(URLExtractRE2)->Arg(1 << 10)->Arg(1 << 16)->Arg(1 << 20);
+
+void URLGlobalReplaceRE2(benchmark::State& state) {
+  std::string url = "https://www.example.com/";
+  url.append(state.range(0), 'x');
+
+  RE2::Options opt;
+  opt.set_dot_nl(true);
+  RE2 re(R"(^https?://(?:www\.)?([^/]+)/.*$)", opt);
+  ABSL_CHECK(re.ok());
+  std::string s;
+  for (auto _ : state) {
+    state.PauseTiming();
+    s = url;
+    state.ResumeTiming();
+    ABSL_CHECK_EQ(RE2::GlobalReplace(&s, re, R"(\1)"), 1);
+  }
+}
+
+BENCHMARK(URLGlobalReplaceRE2)->Arg(1 << 10)->Arg(1 << 16)->Arg(1 << 20);
+
 }  // namespace re2