Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions re2/compile.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1077,6 +1077,54 @@ static bool IsAnchorEnd(Regexp** pre, int depth) {
return false;
}

// Detects whether the simplified regexp ends with a trailing match-all
// (a greedy star over "any character"), optionally followed by
// end-of-line/end-of-text anchors and optionally wrapped in a capture
// group. On success sets has_trailing_match_all and
// trailing_match_all_cap on prog; otherwise prog is left untouched.
// Only applicable for the forward (non-reversed) program.
//
// The star is accepted only in Latin-1 mode. In UTF-8 mode "any
// character" matches well-formed UTF-8 sequences only, so a greedy .*
// can stop short of end-of-text on malformed input; a caller that
// extends the match to end-of-text would then report a match that the
// regexp itself does not produce. In Latin-1 mode every byte is a
// character, so the star really does consume all remaining text.
static void DetectTrailingMatchAll(Regexp* re, Prog* prog) {
  // Walk through a top-level Concat to find the last meaningful child.
  Regexp* last = re;
  if (last->op() == kRegexpConcat && last->nsub() > 0) {
    int idx = last->nsub() - 1;
    // Skip trailing anchors ($ or \z), and empty matches left behind
    // by IsAnchorEnd stripping \z -- they are always satisfied once
    // the star has consumed all remaining text. Index 0 is never
    // skipped, so a concat made only of anchors yields its first child.
    while (idx > 0 &&
           (last->sub()[idx]->op() == kRegexpEndLine ||
            last->sub()[idx]->op() == kRegexpEndText ||
            last->sub()[idx]->op() == kRegexpEmptyMatch))
      idx--;
    last = last->sub()[idx];
  }

  // Check for optional capture wrapping. cap stays -1 when the star is
  // not inside a capture group.
  int cap = -1;
  if (last->op() == kRegexpCapture) {
    cap = last->cap();
    last = last->sub()[0];
    // Look through one Concat inside the capture (the capture might
    // contain a concat with .* as its last element, although typically
    // it is just .* directly). Trailing anchors are not skipped here.
    if (last->op() == kRegexpConcat && last->nsub() > 0)
      last = last->sub()[last->nsub() - 1];
  }

  // Now last must be a greedy kRegexpStar over any-char, in OneLine and
  // Latin-1 mode:
  //  - non-greedy .* doesn't necessarily consume all remaining text;
  //  - without OneLine, ^ and $ are line-oriented and the optimization
  //    may produce incorrect results for multiline patterns;
  //  - without Latin1, any-char does not match arbitrary bytes (see the
  //    function comment).
  if (last->op() == kRegexpStar && last->nsub() == 1 &&
      (last->parse_flags() & Regexp::OneLine) &&
      (last->parse_flags() & Regexp::Latin1) &&
      !(last->parse_flags() & Regexp::NonGreedy) &&
      last->sub()[0]->op() == kRegexpAnyChar) {
    prog->set_has_trailing_match_all(true);
    prog->set_trailing_match_all_cap(cap);
  }
}

void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
RE2::Anchor anchor) {
if (flags & Regexp::Latin1)
Expand Down Expand Up @@ -1128,6 +1176,13 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
bool is_anchor_start = IsAnchorStart(&sre, 0);
bool is_anchor_end = IsAnchorEnd(&sre, 0);

// Detect trailing match-all (e.g. (?s:.*)$) before compilation.
// Only for the forward program. The trailing .* matches all
// remaining text regardless of whether there's an explicit end
// anchor, because it's a greedy match of all characters.
if (!reversed)
DetectTrailingMatchAll(sre, c.prog_);

// Generate fragment for entire regexp.
Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
sre->Decref();
Expand Down
2 changes: 2 additions & 0 deletions re2/prog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ Prog::Prog()
reversed_(false),
did_flatten_(false),
did_onepass_(false),
has_trailing_match_all_(false),
trailing_match_all_cap_(-1),
start_(0),
start_unanchored_(0),
size_(0),
Expand Down
6 changes: 6 additions & 0 deletions re2/prog.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,10 @@ class Prog {
void set_anchor_start(bool b) { anchor_start_ = b; }
bool anchor_end() { return anchor_end_; }
void set_anchor_end(bool b) { anchor_end_ = b; }
// Whether the pattern was detected as ending with a trailing
// match-all (any-char star).
bool has_trailing_match_all() { return has_trailing_match_all_; }
void set_has_trailing_match_all(bool b) { has_trailing_match_all_ = b; }
// Index of the capture group wrapping the trailing match-all,
// or -1 if there is none.
int trailing_match_all_cap() { return trailing_match_all_cap_; }
void set_trailing_match_all_cap(int cap) { trailing_match_all_cap_ = cap; }
int bytemap_range() { return bytemap_range_; }
const uint8_t* bytemap() { return bytemap_; }
bool can_prefix_accel() { return prefix_size_ != 0; }
Expand Down Expand Up @@ -439,6 +443,8 @@ class Prog {
bool reversed_; // whether program runs backward over input
bool did_flatten_; // has Flatten been called?
bool did_onepass_; // has IsOnePass been called?
bool has_trailing_match_all_; // pattern ends with .* (any-char star)
int trailing_match_all_cap_; // capture index wrapping trailing .*, or -1

int start_; // entry point for program
int start_unanchored_; // unanchored entry point for program
Expand Down
119 changes: 119 additions & 0 deletions re2/re2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,78 @@ namespace re2 {
// Controls the maximum count permitted by GlobalReplace(); -1 is unlimited.
static int maximum_global_replace_count = -1;

// Strips a trailing greedy match-all (.* where . is kRegexpAnyChar)
// from the Regexp tree *pre. Looks through trailing anchors ($, \z),
// empty matches, and capture groups (recursively). Returns true if
// stripping occurred.
//
// Ownership: on success the caller's reference to the old *pre is
// released and *pre is replaced by a new tree the caller owns. On
// failure *pre and its reference count are left unchanged.
//
// Only greedy stars in OneLine, Latin-1 mode are stripped. These
// conditions must be at least as strict as those used by
// DetectTrailingMatchAll (compile.cc). The Latin-1 requirement exists
// because in UTF-8 mode any-char matches well-formed UTF-8 sequences
// only, so .* may stop before end-of-text on malformed input; replacing
// it with "extend the match to end-of-text" would then change results.
static bool StripTrailingMatchAll(Regexp** pre) {
  Regexp* re = *pre;
  if (re == NULL)
    return false;

  switch (re->op()) {
    default:
      break;

    case kRegexpConcat: {
      if (re->nsub() == 0)
        break;

      // Find the last child that is not an anchor or empty match.
      // Index 0 is never skipped.
      int last = re->nsub() - 1;
      while (last > 0 &&
             (re->sub()[last]->op() == kRegexpEndLine ||
              re->sub()[last]->op() == kRegexpEndText ||
              re->sub()[last]->op() == kRegexpEmptyMatch))
        last--;

      // Take our own reference so the recursive call can consume it.
      Regexp* sub = re->sub()[last]->Incref();
      if (!StripTrailingMatchAll(&sub)) {
        sub->Decref();
        break;
      }

      // Rebuild the concat from children 0..last-1 plus the stripped
      // child. Children after `last` (anchors / empty matches) are
      // dropped, and so is the stripped child if nothing is left of it.
      std::vector<Regexp*> subcopy;
      subcopy.reserve(static_cast<size_t>(last) + 1);
      for (int i = 0; i < last; i++)
        subcopy.push_back(re->sub()[i]->Incref());
      if (sub->op() != kRegexpEmptyMatch)
        subcopy.push_back(sub);
      else
        sub->Decref();

      if (subcopy.empty()) {
        // Nothing remains: substitute an empty-string regexp.
        *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
      } else {
        *pre = Regexp::Concat(subcopy.data(),
                              static_cast<int>(subcopy.size()),
                              re->parse_flags());
      }
      re->Decref();
      return true;
    }

    case kRegexpCapture: {
      Regexp* sub = re->sub()[0]->Incref();
      if (!StripTrailingMatchAll(&sub)) {
        sub->Decref();
        break;
      }
      // Re-wrap the stripped body in a capture with the same index.
      *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
      re->Decref();
      return true;
    }

    case kRegexpStar: {
      // Strip only a greedy any-char star in OneLine, Latin-1 mode
      // (see the function comment for why Latin-1 is required).
      if ((re->parse_flags() & Regexp::OneLine) &&
          (re->parse_flags() & Regexp::Latin1) &&
          !(re->parse_flags() & Regexp::NonGreedy) &&
          re->sub()[0]->op() == kRegexpAnyChar) {
        *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
        re->Decref();
        return true;
      }
      break;
    }
  }
  return false;
}

void RE2::FUZZING_ONLY_set_maximum_global_replace_count(int i) {
maximum_global_replace_count = i;
}
Expand Down Expand Up @@ -227,6 +299,8 @@ void RE2::Init(absl::string_view pattern, const Options& options) {
prefix_foldcase_ = false;
prefix_.clear();
prog_ = NULL;
prefix_match_ = NULL;
trailing_match_all_cap_ = -1;

rprog_ = NULL;
named_groups_ = NULL;
Expand Down Expand Up @@ -281,6 +355,21 @@ void RE2::Init(absl::string_view pattern, const Options& options) {
// and that is harder to do if the DFA has already
// been built.
is_one_pass_ = prog_->IsOnePass();

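// If the pattern ends with a trailing match-all (e.g. (?s:.*)$),
// build a prefix RE2 with the .* stripped for fast matching.
// NOTE: detection ran on the simplified tree during compilation, but
// stripping here starts from entire_regexp_. If the two trees differ,
// StripTrailingMatchAll may return false and no prefix RE2 is built.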
if (prog_->has_trailing_match_all()) {
trailing_match_all_cap_ = prog_->trailing_match_all_cap();
re2::Regexp* stripped = entire_regexp_->Incref();
if (StripTrailingMatchAll(&stripped)) {
std::string stripped_pattern = stripped->ToString();
stripped->Decref();
prefix_match_ = new RE2(stripped_pattern, options_);
} else {
stripped->Decref();
}
}
}

// Returns rprog_, computing it if needed.
Expand All @@ -303,6 +392,7 @@ re2::Prog* RE2::ReverseProg() const {
}

RE2::~RE2() {
delete prefix_match_;
if (group_names_ != empty_group_names())
delete group_names_;
if (named_groups_ != empty_named_groups())
Expand Down Expand Up @@ -676,6 +766,35 @@ bool RE2::Match(absl::string_view text,
return false;
}

// Fast path: if we have a prefix RE2 (trailing match-all was stripped),
// delegate to it and extend the results to cover all remaining text.
if (prefix_match_ != NULL && prefix_match_->ok()) {
// The prefix RE2 doesn't have the trailing .*, so for ANCHOR_BOTH
// (full match), downgrade to ANCHOR_START — the .* would have
// consumed everything to end-of-text.
Anchor prefix_anchor = re_anchor;
if (prefix_anchor == ANCHOR_BOTH)
prefix_anchor = ANCHOR_START;
if (!prefix_match_->Match(text, startpos, endpos, prefix_anchor,
submatch, nsubmatch))
return false;
if (nsubmatch > 0) {
// Extend overall match to endpos.
submatch[0] = absl::string_view(
submatch[0].data(),
static_cast<size_t>(text.data() + endpos - submatch[0].data()));
// Extend trailing capture group if present.
int cap = trailing_match_all_cap_;
if (cap >= 0 && cap < nsubmatch &&
submatch[cap].data() != nullptr)
submatch[cap] = absl::string_view(
submatch[cap].data(),
static_cast<size_t>(text.data() + endpos -
submatch[cap].data()));
}
return true;
}

absl::string_view subtext = text;
subtext.remove_prefix(startpos);
subtext.remove_suffix(text.size() - endpos);
Expand Down
6 changes: 6 additions & 0 deletions re2/re2.h
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,12 @@ class RE2 {
std::string prefix_; // required prefix (before suffix_regexp_)
re2::Prog* prog_; // compiled program for regexp

// If the pattern ends with a trailing match-all (e.g. (?s:.*)$),
// this is a compiled RE2 for the prefix (with .* stripped).
// Match delegates to this and extends the results to end-of-text.
RE2* prefix_match_;
int trailing_match_all_cap_; // capture index wrapping .*, or -1

// Reverse Prog for DFA execution only
mutable re2::Prog* rprog_;
// Map from capture names to indices
Expand Down
Loading