Add --regex option

As reported in issue #981, using std::regex in IWYU has caused a
tremendous performance regression for large mapping files containing
regex mappings.

  $ cat t.cc
  #include <string>

  # with llvm::Regex
  $ time include-what-you-use -Xiwyu --mapping_file=qt5_11.imp t.cc
  ...
  real 0m0,529s
  user 0m0,509s
  sys  0m0,020s

  # with std::regex
  $ time include-what-you-use -Xiwyu --mapping_file=qt5_11.imp t.cc
  ...
  real 0m29,870s
  user 0m29,717s
  sys  0m0,012s

qt5_11.imp contains 2300+ regex mappings, and <string> has a bunch of
includes, so this is a good testbed for regular expression engines, but
over 50x slower is not the result we were hoping for.

The reason we switched to std::regex was to get support for negative
lookaround (llvm::Regex does not have it), but exotic regexes in
mappings are pretty rare, and this is a significant performance hit.

Introduce a --regex option to select regex dialect, with documented
tradeoffs. Put the default back to LLVM's fast implementation.

This fixes issue #981.
This commit is contained in:
Kim Gräsman 2022-09-02 20:55:48 +02:00
parent 0a714265c0
commit cf53880822
7 changed files with 83 additions and 14 deletions

View File

@ -134,6 +134,20 @@ No new includes are added, existing ones are removed.
.B \-\-quoted_includes_first
When sorting includes, place quoted includes first.
.TP
.BI \-\-regex= dialect
Use the specified regex
.IR dialect
for matching in IWYU:
.RS
.TP
.B llvm
Fast and simple extended regular expressions. This is the default.
.TP
.B ecmascript
Slower but more full-featured regular expressions with support for lookaround
assertions, etc.
.RE
.TP
.B \-\-transitive_includes_only
Do not suggest that a file should add
.IR foo.h " unless " foo.h

View File

@ -24,6 +24,7 @@
#include "iwyu_location_util.h"
#include "iwyu_path_util.h"
#include "iwyu_port.h" // for CHECK_, etc
#include "iwyu_regex.h"
#include "iwyu_stl_util.h"
#include "iwyu_string_util.h"
#include "iwyu_verrs.h"
@ -108,6 +109,9 @@ static void PrintHelp(const char* extra_msg) {
" --error_always[=N]: always exit with N (default: 1) (for use\n"
" with 'make -k')\n"
" --debug=flag[,flag...]: debug flags (undocumented)\n"
" --regex=<dialect>: use specified regex dialect in IWYU:\n"
" llvm: fast and simple (default)\n"
" ecmascript: slower, but more feature-complete\n"
"\n"
"In addition to IWYU-specific options you can specify the following\n"
"options without -Xiwyu prefix:\n"
@ -197,7 +201,8 @@ CommandlineFlags::CommandlineFlags()
quoted_includes_first(false),
cxx17ns(false),
exit_code_error(EXIT_SUCCESS),
exit_code_always(EXIT_SUCCESS) {
exit_code_always(EXIT_SUCCESS),
regex_dialect(RegexDialect::LLVM) {
// Always keep Qt .moc includes; its moc compiler does its own IWYU analysis.
keep.emplace("*.moc");
}
@ -222,9 +227,10 @@ int CommandlineFlags::ParseArgv(int argc, char** argv) {
{"error", optional_argument, nullptr, 'e'},
{"error_always", optional_argument, nullptr, 'a'},
{"debug", required_argument, nullptr, 'd'},
{"regex", required_argument, nullptr, 'r'},
{nullptr, 0, nullptr, 0}
};
static const char shortopts[] = "v:c:m:d:n";
static const char shortopts[] = "v:c:m:d:nr";
while (true) {
switch (getopt_long(argc, argv, shortopts, longopts, nullptr)) {
case 'c': AddGlobToReportIWYUViolationsFor(optarg); break;
@ -296,6 +302,12 @@ int CommandlineFlags::ParseArgv(int argc, char** argv) {
}
break;
}
case 'r':
if (!ParseRegexDialect(optarg, &regex_dialect)) {
PrintHelp("FATAL ERROR: unsupported regex dialect.");
exit(EXIT_FAILURE);
}
break;
case -1: return optind; // means 'no more input'
default:
PrintHelp("FATAL ERROR: unknown flag.");
@ -401,7 +413,8 @@ void InitGlobals(clang::SourceManager* sm, clang::HeaderSearch* header_search) {
vector<HeaderSearchPath> search_paths =
ComputeHeaderSearchPaths(header_search);
SetHeaderSearchPaths(search_paths);
include_picker = new IncludePicker(GlobalFlags().no_default_mappings);
include_picker = new IncludePicker(GlobalFlags().no_default_mappings,
GlobalFlags().regex_dialect);
function_calls_full_use_cache = new FullUseCache;
class_members_full_use_cache = new FullUseCache;
@ -494,7 +507,8 @@ void InitGlobalsAndFlagsForTesting() {
commandline_flags = new CommandlineFlags;
source_manager = nullptr;
data_getter = nullptr;
include_picker = new IncludePicker(GlobalFlags().no_default_mappings);
include_picker = new IncludePicker(GlobalFlags().no_default_mappings,
GlobalFlags().regex_dialect);
function_calls_full_use_cache = new FullUseCache;
class_members_full_use_cache = new FullUseCache;

View File

@ -28,6 +28,7 @@ using std::set;
using std::string;
using std::vector;
enum class RegexDialect;
class FullUseCache;
class IncludePicker;
class SourceManagerCharacterDataGetter;
@ -98,6 +99,7 @@ struct CommandlineFlags {
int exit_code_error; // Exit with this code for iwyu violations.
int exit_code_always; // Always exit with this exit code.
set<string> dbg_flags; // Debug flags.
RegexDialect regex_dialect; // Dialect for regular expression processing.
};
const CommandlineFlags& GlobalFlags();

View File

@ -1231,8 +1231,10 @@ bool MappedInclude::HasAbsoluteQuotedInclude() const {
return IsAbsolutePath(path);
}
IncludePicker::IncludePicker(bool no_default_mappings)
: has_called_finalize_added_include_lines_(false) {
IncludePicker::IncludePicker(bool no_default_mappings,
RegexDialect regex_dialect)
: has_called_finalize_added_include_lines_(false),
regex_dialect(regex_dialect) {
if (!no_default_mappings) {
AddDefaultMappings();
}
@ -1437,7 +1439,8 @@ void IncludePicker::ExpandRegexes() {
const vector<MappedInclude>& map_to = filepath_include_map_[regex_key];
// Enclose the regex in ^(...)$ for full match.
std::string regex("^(" + regex_key.substr(1) + ")$");
if (RegexMatch(hdr, regex) && !ContainsQuotedInclude(map_to, hdr)) {
if (RegexMatch(regex_dialect, hdr, regex) &&
!ContainsQuotedInclude(map_to, hdr)) {
Extend(&filepath_include_map_[hdr], filepath_include_map_[regex_key]);
MarkVisibility(&include_visibility_map_, hdr,
include_visibility_map_[regex_key]);
@ -1445,7 +1448,7 @@ void IncludePicker::ExpandRegexes() {
}
for (const string& regex_key : friend_to_headers_map_regex_keys) {
std::string regex("^(" + regex_key.substr(1) + ")$");
if (RegexMatch(hdr, regex)) {
if (RegexMatch(regex_dialect, hdr, regex)) {
InsertAllInto(friend_to_headers_map_[regex_key],
&friend_to_headers_map_[hdr]);
}

View File

@ -65,6 +65,7 @@ using std::vector;
struct IncludeMapEntry;
enum class RegexDialect;
enum IncludeVisibility { kUnusedVisibility, kPublic, kPrivate };
// When a symbol or file is mapped to an include, that include is represented
@ -91,7 +92,7 @@ class IncludePicker {
// visibility of the respective files.
typedef map<string, IncludeVisibility> VisibilityMap;
explicit IncludePicker(bool no_default_mappings);
IncludePicker(bool no_default_mappings, RegexDialect regex_dialect);
// ----- Routines to dynamically modify the include-picker
@ -291,6 +292,9 @@ class IncludePicker {
// Make sure we don't do any non-const operations after finalizing.
bool has_called_finalize_added_include_lines_;
// Controls regex dialect to use for mappings.
RegexDialect regex_dialect;
}; // class IncludePicker
} // namespace include_what_you_use

View File

@ -10,12 +10,38 @@
#include "iwyu_regex.h"
#include <regex>
#include "llvm/Support/Regex.h"
#include "iwyu_port.h"
namespace include_what_you_use {
bool RegexMatch(const std::string& str, const std::string& pattern) {
std::regex r(pattern);
return std::regex_match(str, r);
bool ParseRegexDialect(const char* str, RegexDialect* dialect) {
if (strcmp(str, "llvm") == 0) {
*dialect = RegexDialect::LLVM;
return true;
} else if (strcmp(str, "ecmascript") == 0) {
*dialect = RegexDialect::ECMAScript;
return true;
}
return false;
}
// Returns true if str matches regular expression pattern for the given dialect.
bool RegexMatch(RegexDialect dialect, const std::string& str,
const std::string& pattern) {
switch (dialect) {
case RegexDialect::LLVM: {
llvm::Regex r(pattern);
return r.match(str);
}
case RegexDialect::ECMAScript: {
std::regex r(pattern, std::regex_constants::ECMAScript);
return std::regex_match(str, r);
}
}
CHECK_UNREACHABLE_("Unexpected regex dialect");
}
} // namespace include_what_you_use

View File

@ -14,8 +14,14 @@
namespace include_what_you_use {
// Returns true if str matches regular expression pattern.
bool RegexMatch(const std::string& str, const std::string& pattern);
enum class RegexDialect { LLVM = 0, ECMAScript = 1 };
// Parse dialect string to enum.
bool ParseRegexDialect(const char* str, RegexDialect* dialect);
// Returns true if str matches regular expression pattern for the given dialect.
bool RegexMatch(RegexDialect dialect, const std::string& str,
const std::string& pattern);
} // namespace include_what_you_use