From cf5388082266e93afe3b3e9688b1bac27319eb89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kim=20Gr=C3=A4sman?= Date: Fri, 2 Sep 2022 20:55:48 +0200 Subject: [PATCH] Add --regex option As reported in issue #981, using std::regex in IWYU has caused a tremendous performance regression for large mapping files containing regex mappings. $ cat t.cc #include # with llvm::Regex $ time include-what-you-use -Xiwyu --mapping_file=qt5_11.imp t.cc ... real 0m0,529s user 0m0,509s sys 0m0,020s # with std::regex $ time include-what-you-use -Xiwyu --mapping_file=qt5_11.imp t.cc ... real 0m29,870s user 0m29,717s sys 0m0,012s qt5_11.imp contains 2300+ regex mappings, and has a bunch of includes, so this is a good testbed for regular expression engines, but over 50x slower is not the result we were hoping for. The reason we switched to std::regex was to get support for negative lookaround (llvm::Regex does not have it), but exotic regexes in mappings are pretty rare, and this is a significant performance hit. Introduce a --regex option to select regex dialect, with documented tradeoffs. Put the default back to LLVM's fast implementation. This fixes issue #981. --- include-what-you-use.1 | 14 ++++++++++++++ iwyu_globals.cc | 22 ++++++++++++++++++---- iwyu_globals.h | 2 ++ iwyu_include_picker.cc | 11 +++++++---- iwyu_include_picker.h | 6 +++++- iwyu_regex.cc | 32 +++++++++++++++++++++++++++++--- iwyu_regex.h | 10 ++++++++-- 7 files changed, 83 insertions(+), 14 deletions(-) diff --git a/include-what-you-use.1 b/include-what-you-use.1 index 3132d92..44a0eda 100644 --- a/include-what-you-use.1 +++ b/include-what-you-use.1 @@ -134,6 +134,20 @@ No new includes are added, existing ones are removed. .B \-\-quoted_includes_first When sorting includes, place quoted includes first. .TP +.BI \-\-regex= dialect +Use the specified regex +.IR dialect +for matching in IWYU: +.RS +.TP +.B llvm +Fast and simple extended regular expressions. This is the default. +.TP +.B ecmascript +Slower but more full-featured regular expressions with support for lookaround +assertions, etc. +.RE +.TP .B \-\-transitive_includes_only Do not suggest that a file should add .IR foo.h " unless " foo.h diff --git a/iwyu_globals.cc b/iwyu_globals.cc index 91730c5..574f618 100644 --- a/iwyu_globals.cc +++ b/iwyu_globals.cc @@ -24,6 +24,7 @@ #include "iwyu_location_util.h" #include "iwyu_path_util.h" #include "iwyu_port.h" // for CHECK_, etc +#include "iwyu_regex.h" #include "iwyu_stl_util.h" #include "iwyu_string_util.h" #include "iwyu_verrs.h" @@ -108,6 +109,9 @@ static void PrintHelp(const char* extra_msg) { " --error_always[=N]: always exit with N (default: 1) (for use\n" " with 'make -k')\n" " --debug=flag[,flag...]: debug flags (undocumented)\n" + " --regex=: use specified regex dialect in IWYU:\n" + " llvm: fast and simple (default)\n" + " ecmascript: slower, but more feature-complete\n" "\n" "In addition to IWYU-specific options you can specify the following\n" "options without -Xiwyu prefix:\n" @@ -197,7 +201,8 @@ CommandlineFlags::CommandlineFlags() quoted_includes_first(false), cxx17ns(false), exit_code_error(EXIT_SUCCESS), - exit_code_always(EXIT_SUCCESS) { + exit_code_always(EXIT_SUCCESS), + regex_dialect(RegexDialect::LLVM) { // Always keep Qt .moc includes; its moc compiler does its own IWYU analysis. keep.emplace("*.moc"); } @@ -222,9 +227,10 @@ int CommandlineFlags::ParseArgv(int argc, char** argv) { {"error", optional_argument, nullptr, 'e'}, {"error_always", optional_argument, nullptr, 'a'}, {"debug", required_argument, nullptr, 'd'}, + {"regex", required_argument, nullptr, 'r'}, {nullptr, 0, nullptr, 0} }; - static const char shortopts[] = "v:c:m:d:n"; + static const char shortopts[] = "v:c:m:d:nr"; while (true) { switch (getopt_long(argc, argv, shortopts, longopts, nullptr)) { case 'c': AddGlobToReportIWYUViolationsFor(optarg); break; @@ -296,6 +302,12 @@ int CommandlineFlags::ParseArgv(int argc, char** argv) { } break; } + case 'r': + if (!ParseRegexDialect(optarg, ®ex_dialect)) { + PrintHelp("FATAL ERROR: unsupported regex dialect."); + exit(EXIT_FAILURE); + } + break; case -1: return optind; // means 'no more input' default: PrintHelp("FATAL ERROR: unknown flag."); @@ -401,7 +413,8 @@ void InitGlobals(clang::SourceManager* sm, clang::HeaderSearch* header_search) { vector search_paths = ComputeHeaderSearchPaths(header_search); SetHeaderSearchPaths(search_paths); - include_picker = new IncludePicker(GlobalFlags().no_default_mappings); + include_picker = new IncludePicker(GlobalFlags().no_default_mappings, + GlobalFlags().regex_dialect); function_calls_full_use_cache = new FullUseCache; class_members_full_use_cache = new FullUseCache; @@ -494,7 +507,8 @@ void InitGlobalsAndFlagsForTesting() { commandline_flags = new CommandlineFlags; source_manager = nullptr; data_getter = nullptr; - include_picker = new IncludePicker(GlobalFlags().no_default_mappings); + include_picker = new IncludePicker(GlobalFlags().no_default_mappings, + GlobalFlags().regex_dialect); function_calls_full_use_cache = new FullUseCache; class_members_full_use_cache = new FullUseCache; diff --git a/iwyu_globals.h b/iwyu_globals.h index 818dbfc..cbdfff1 100644 --- a/iwyu_globals.h +++ b/iwyu_globals.h @@ -28,6 +28,7 @@ using std::set; using std::string; using std::vector; +enum class RegexDialect; class FullUseCache; class IncludePicker; class SourceManagerCharacterDataGetter; @@ -98,6 +99,7 @@ struct CommandlineFlags { int exit_code_error; // Exit with this code for iwyu violations. int exit_code_always; // Always exit with this exit code. set dbg_flags; // Debug flags. + RegexDialect regex_dialect; // Dialect for regular expression processing. }; const CommandlineFlags& GlobalFlags(); diff --git a/iwyu_include_picker.cc b/iwyu_include_picker.cc index ce9288a..411ded3 100644 --- a/iwyu_include_picker.cc +++ b/iwyu_include_picker.cc @@ -1231,8 +1231,10 @@ bool MappedInclude::HasAbsoluteQuotedInclude() const { return IsAbsolutePath(path); } -IncludePicker::IncludePicker(bool no_default_mappings) - : has_called_finalize_added_include_lines_(false) { +IncludePicker::IncludePicker(bool no_default_mappings, + RegexDialect regex_dialect) + : has_called_finalize_added_include_lines_(false), + regex_dialect(regex_dialect) { if (!no_default_mappings) { AddDefaultMappings(); } @@ -1437,7 +1439,8 @@ void IncludePicker::ExpandRegexes() { const vector& map_to = filepath_include_map_[regex_key]; // Enclose the regex in ^(...)$ for full match. std::string regex("^(" + regex_key.substr(1) + ")$"); - if (RegexMatch(hdr, regex) && !ContainsQuotedInclude(map_to, hdr)) { + if (RegexMatch(regex_dialect, hdr, regex) && + !ContainsQuotedInclude(map_to, hdr)) { Extend(&filepath_include_map_[hdr], filepath_include_map_[regex_key]); MarkVisibility(&include_visibility_map_, hdr, include_visibility_map_[regex_key]); @@ -1445,7 +1448,7 @@ void IncludePicker::ExpandRegexes() { } for (const string& regex_key : friend_to_headers_map_regex_keys) { std::string regex("^(" + regex_key.substr(1) + ")$"); - if (RegexMatch(hdr, regex)) { + if (RegexMatch(regex_dialect, hdr, regex)) { InsertAllInto(friend_to_headers_map_[regex_key], &friend_to_headers_map_[hdr]); } diff --git a/iwyu_include_picker.h b/iwyu_include_picker.h index 5a1f1f0..ccf9177 100644 --- a/iwyu_include_picker.h +++ b/iwyu_include_picker.h @@ -65,6 +65,7 @@ using std::vector; struct IncludeMapEntry; +enum class RegexDialect; enum IncludeVisibility { kUnusedVisibility, kPublic, kPrivate }; // When a symbol or file is mapped to an include, that include is represented @@ -91,7 +92,7 @@ class IncludePicker { // visibility of the respective files. typedef map VisibilityMap; - explicit IncludePicker(bool no_default_mappings); + IncludePicker(bool no_default_mappings, RegexDialect regex_dialect); // ----- Routines to dynamically modify the include-picker @@ -291,6 +292,9 @@ class IncludePicker { // Make sure we don't do any non-const operations after finalizing. bool has_called_finalize_added_include_lines_; + + // Controls regex dialect to use for mappings. + RegexDialect regex_dialect; }; // class IncludePicker } // namespace include_what_you_use diff --git a/iwyu_regex.cc b/iwyu_regex.cc index bc52f6b..dca92be 100644 --- a/iwyu_regex.cc +++ b/iwyu_regex.cc @@ -10,12 +10,38 @@ #include "iwyu_regex.h" #include +#include "llvm/Support/Regex.h" + +#include "iwyu_port.h" namespace include_what_you_use { -bool RegexMatch(const std::string& str, const std::string& pattern) { - std::regex r(pattern); - return std::regex_match(str, r); +bool ParseRegexDialect(const char* str, RegexDialect* dialect) { + if (strcmp(str, "llvm") == 0) { + *dialect = RegexDialect::LLVM; + return true; + } else if (strcmp(str, "ecmascript") == 0) { + *dialect = RegexDialect::ECMAScript; + return true; + } + return false; +} + +// Returns true if str matches regular expression pattern for the given dialect. +bool RegexMatch(RegexDialect dialect, const std::string& str, + const std::string& pattern) { + switch (dialect) { + case RegexDialect::LLVM: { + llvm::Regex r(pattern); + return r.match(str); + } + + case RegexDialect::ECMAScript: { + std::regex r(pattern, std::regex_constants::ECMAScript); + return std::regex_match(str, r); + } + } + CHECK_UNREACHABLE_("Unexpected regex dialect"); } } // namespace include_what_you_use diff --git a/iwyu_regex.h b/iwyu_regex.h index 107730e..63b4435 100644 --- a/iwyu_regex.h +++ b/iwyu_regex.h @@ -14,8 +14,14 @@ namespace include_what_you_use { -// Returns true if str matches regular expression pattern. -bool RegexMatch(const std::string& str, const std::string& pattern); +enum class RegexDialect { LLVM = 0, ECMAScript = 1 }; + +// Parse dialect string to enum. +bool ParseRegexDialect(const char* str, RegexDialect* dialect); + +// Returns true if str matches regular expression pattern for the given dialect. +bool RegexMatch(RegexDialect dialect, const std::string& str, + const std::string& pattern); } // namespace include_what_you_use