From 460e046e9921bbbf35841a908cfbcc034d17375d Mon Sep 17 00:00:00 2001 From: Nate Matykiewicz Date: Wed, 22 May 2024 16:43:48 -0500 Subject: [PATCH] Improve performance of finding indexables Currently, all folders and files in the current tree are turned into IndexablePath, and then excluded files are filtered out after. When there are large file trees that are meant to be excluded, this results in a lot of unnecessary work. ActiveStorage stores files in the `tmp` directory in many many small folders. So does Bootsnap. Ruby LSP has to traverse all of these files, even though the entire directory should just be ignored. Rubocop has solved this by breaking the `includes` patterns up into many patterns, applying the exclusions *before* the `Dir.glob`, so I followed in their footsteps. This works great for exclusions that end in "**/*". We still need to loop through all IndexablePath objects and see if they're excluded, in the case that an extension was provided on the excluded path, but this can cut down load time dramatically. Before this PR in my Rails app, `indexables` took 76 seconds to run. Now it takes, 0.17 seconds. Before and after code both return the same exact file list. --- .../lib/ruby_indexer/configuration.rb | 79 +++++++++++++++---- lib/ruby_indexer/test/configuration_test.rb | 9 ++- 2 files changed, 73 insertions(+), 15 deletions(-) diff --git a/lib/ruby_indexer/lib/ruby_indexer/configuration.rb b/lib/ruby_indexer/lib/ruby_indexer/configuration.rb index a3377fbf8..4cd21f866 100644 --- a/lib/ruby_indexer/lib/ruby_indexer/configuration.rb +++ b/lib/ruby_indexer/lib/ruby_indexer/configuration.rb @@ -20,9 +20,16 @@ class Configuration def initialize @excluded_gems = T.let(initial_excluded_gems, T::Array[String]) @included_gems = T.let([], T::Array[String]) - @excluded_patterns = T.let([File.join("**", "*_test.rb"), File.join("**", "tmp", "**", "*")], T::Array[String]) + @excluded_patterns = T.let( + [ + File.join("**", "*_test.rb"), + File.join("**", "tmp", "**", "*"), + File.join("**", "node_modules", "**", "*"), + ], + T::Array[String], + ) path = Bundler.settings["path"] - @excluded_patterns << File.join(File.expand_path(path, Dir.pwd), "**", "*.rb") if path + @excluded_patterns << File.join(File.expand_path(path, Dir.pwd), "**", "*") if path @included_patterns = T.let([File.join(Dir.pwd, "**", "*.rb")], T::Array[String]) @excluded_magic_comments = T.let( @@ -43,6 +50,15 @@ def initialize ) end + sig { returns(String) } + def exclude_pattern + relative_patterns = @excluded_patterns + .select { |p| p.end_with?("/**/*") } + .map { |p| p.delete_prefix("#{Dir.pwd}/") } + + "#{Dir.pwd}/{#{relative_patterns.join(",")}}" + end + sig { returns(T::Array[IndexablePath]) } def indexables excluded_gems = @excluded_gems - @included_gems @@ -52,21 +68,25 @@ def indexables # having duplicates if BUNDLE_PATH is set to a folder inside the project structure # Add user specified patterns - indexables = @included_patterns.flat_map do |pattern| + patterns = indexable_included_file_patterns.sort.map! { |dir| File.join(dir, "*.rb") } + patterns = [File.join(Dir.pwd, "**/*.rb")] if patterns.empty? + + indexables = patterns.flat_map do |pattern| load_path_entry = T.let(nil, T.nilable(String)) - Dir.glob(pattern, File::FNM_PATHNAME | File::FNM_EXTGLOB).map! do |path| - path = File.expand_path(path) - # All entries for the same pattern match the same $LOAD_PATH entry. Since searching the $LOAD_PATH for every - # entry is expensive, we memoize it until we find a path that doesn't belong to that $LOAD_PATH. This happens - # on repositories that define multiple gems, like Rails. All frameworks are defined inside the Dir.pwd, but - # each one of them belongs to a different $LOAD_PATH entry - if load_path_entry.nil? || !path.start_with?(load_path_entry) - load_path_entry = $LOAD_PATH.find { |load_path| path.start_with?(load_path) } + Dir.glob(pattern, File::FNM_PATHNAME | File::FNM_EXTGLOB) + .map! do |path| + path = File.expand_path(path) + # All entries for the same pattern match the same $LOAD_PATH entry. Since searching the $LOAD_PATH for every + # entry is expensive, we memoize it until we find a path that doesn't belong to that $LOAD_PATH. This + # happens on repositories that define multiple gems, like Rails. All frameworks are defined inside the + # Dir.pwd, but each one of them belongs to a different $LOAD_PATH entry + if load_path_entry.nil? || !path.start_with?(load_path_entry) + load_path_entry = $LOAD_PATH.find { |load_path| path.start_with?(load_path) } + end + + IndexablePath.new(load_path_entry, path) end - - IndexablePath.new(load_path_entry, path) - end end # Remove user specified patterns @@ -158,6 +178,37 @@ def apply_config(config) private + sig { params(base_directory: String, excluded_pattern: String).returns(T::Array[String]) } + def indexable_included_file_patterns(base_directory = Dir.pwd, excluded_pattern = exclude_pattern) + flags = File::FNM_PATHNAME | File::FNM_EXTGLOB + + # Escape File.fnmatch? wildcards in the directory + base_directory = base_directory.gsub(/[\\\{\}\[\]\*\?]/) do |reserved_glob_character| + "\\#{reserved_glob_character}" + end + + dirs = Dir.glob(File.join(base_directory, "*/"), flags) + .reject do |dir| + next true if dir.end_with?("/./", "/../") + next true if File.fnmatch?(excluded_pattern, dir, flags) + + symlink_excluded_or_infinite_loop?(base_directory, dir, excluded_pattern, flags) + end + + dirs + .flat_map { |dir| indexable_included_file_patterns(dir, excluded_pattern) } + .unshift(base_directory) + end + + sig { params(base_dir: String, current_dir: String, exclude_pattern: String, flags: Integer).returns(T::Boolean) } + def symlink_excluded_or_infinite_loop?(base_dir, current_dir, exclude_pattern, flags) + dir_realpath = File.realpath(current_dir) + File.symlink?(current_dir.chomp("/")) && ( + File.fnmatch?(exclude_pattern, "#{dir_realpath}/", flags) || + File.realpath(base_dir).start_with?(dir_realpath) + ) + end + sig { params(config: T::Hash[String, T.untyped]).void } def validate_config!(config) errors = config.filter_map do |key, value| diff --git a/lib/ruby_indexer/test/configuration_test.rb b/lib/ruby_indexer/test/configuration_test.rb index 05609f77a..6c5d2cf67 100644 --- a/lib/ruby_indexer/test/configuration_test.rb +++ b/lib/ruby_indexer/test/configuration_test.rb @@ -49,6 +49,13 @@ def test_indexables_does_not_include_default_gem_path_when_in_bundle ) end + def test_exclude_pattern + assert_equal( + @config.exclude_pattern, + "#{Dir.pwd}/{**/tmp/**/*,**/node_modules/**/*,vendor/bundle/**/*}", + ) + end + def test_indexables_includes_default_gems indexables = @config.indexables.map(&:full_path) @@ -71,7 +78,7 @@ def test_indexables_avoids_duplicates_if_bundle_path_is_inside_project Bundler.settings.temporary(path: "vendor/bundle") do config = Configuration.new - assert_includes(config.instance_variable_get(:@excluded_patterns), "#{Dir.pwd}/vendor/bundle/**/*.rb") + assert_includes(config.instance_variable_get(:@excluded_patterns), "#{Dir.pwd}/vendor/bundle/**/*") end end