parallelize POSIX dyld RefreshModules

Tom Yang · Tom Yang · commit d0f9fadf068e · 2025-03-12T00:32:48.000-07:00
This diff parallelizes `DynamicLoaderPOSIXDYLD::RefreshModules`, which speeds up module loading on Linux. The major benefit of this is we can speed up symbol table indexing and parsing, which is the biggest bottleneck for targets which dynamically link many shared libraries.

This speedup is only noticeable when **preloading** symbols, i.e. when `target.preload-symbols` is `true`, which is the default at Meta. The symbol preload option tells the debugger to fully load all of the symbol tables when modules are loaded, as opposed to loading them lazily when symbols are requested.

Initially, I discovered the specific bottleneck by using the Linux `perf` tool. I saw that ~93% of samples were in `RefreshModules`, and mainly in `LoadModuleAtAddress` and `PreloadSymbols`.

`LoadModuleAtAddress` appears independent and parallelizable at first. The main issue is that `DynamicLoaderPOSIXDYLD` maintains a map of loaded modules to their link addresses via `m_loaded_modules`. Concurrently modifying and reading this map isn't thread-safe, so this diff also includes accessor methods that protect the map in the multithreaded context. Luckily, the critical section of modifying or reading from the map isn't super costly, so the contention doesn't appear to negatively impact performance.

I tested with some larger projects with up to 15000 modules, and
found significant performance improvements. Typically, I was seeing 2-3X
launch speed increases, where "launch speed" is starting the binary and
reaching `main`.

I manually ran `ninja check-lldb` several times, and compared with the baseline. At this point, we're not seeing any new failures or new unresolved tests.
diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp
@@ -10,6 +10,7 @@
 #include "DynamicLoaderPOSIXDYLD.h"
 
 #include "lldb/Breakpoint/BreakpointLocation.h"
+#include "lldb/Core/Debugger.h"
 #include "lldb/Core/Module.h"
 #include "lldb/Core/ModuleSpec.h"
 #include "lldb/Core/PluginManager.h"
@@ -25,6 +26,7 @@
 #include "lldb/Utility/LLDBLog.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/ProcessInfo.h"
+#include "llvm/Support/ThreadPool.h"
 
 #include <memory>
 #include <optional>
@@ -231,16 +233,37 @@ void DynamicLoaderPOSIXDYLD::DidLaunch() {
 
 Status DynamicLoaderPOSIXDYLD::CanLoadImage() { return Status(); }
 
+void DynamicLoaderPOSIXDYLD::SetLoadedModule(const ModuleSP &module_sp,
+                                             addr_t link_map_addr) { // Records link_map_addr for module_sp in m_loaded_modules.
+  std::unique_lock<std::shared_mutex> lock(m_loaded_modules_rw_mutex); // Exclusive (writer) lock for the map update.
+  m_loaded_modules[module_sp] = link_map_addr; // Inserts the entry or overwrites an existing one.
+}
+
+void DynamicLoaderPOSIXDYLD::UnloadModule(const ModuleSP &module_sp) { // Drops module_sp from m_loaded_modules.
+  std::unique_lock<std::shared_mutex> lock(m_loaded_modules_rw_mutex); // Exclusive (writer) lock for the map update.
+  m_loaded_modules.erase(module_sp); // Erasing by key is a no-op when the module is not in the map.
+}
+
+std::optional<lldb::addr_t>
+DynamicLoaderPOSIXDYLD::GetLoadedModuleLinkAddr(const ModuleSP &module_sp) { // Looks up the link map address stored for module_sp.
+  std::shared_lock<std::shared_mutex> lock(m_loaded_modules_rw_mutex); // Shared (reader) lock; writers take the same mutex exclusively.
+  auto it = m_loaded_modules.find(module_sp);
+  if (it != m_loaded_modules.end())
+    return it->second; // Value copied out while the lock is still held.
+  return std::nullopt; // module_sp has no entry in m_loaded_modules.
+}
+
 void DynamicLoaderPOSIXDYLD::UpdateLoadedSections(ModuleSP module,
                                                   addr_t link_map_addr,
                                                   addr_t base_addr,
                                                   bool base_addr_is_offset) {
-  m_loaded_modules[module] = link_map_addr;
+  SetLoadedModule(module, link_map_addr); // Record the link map address under the map's mutex.
+
   UpdateLoadedSectionsCommon(module, base_addr, base_addr_is_offset);
 }
 
 void DynamicLoaderPOSIXDYLD::UnloadSections(const ModuleSP module) {
-  m_loaded_modules.erase(module);
+  UnloadModule(module); // Erase the module's map entry under the map's mutex.
 
   UnloadSectionsCommon(module);
 }
@@ -448,7 +471,7 @@ void DynamicLoaderPOSIXDYLD::RefreshModules() {
   // The rendezvous class doesn't enumerate the main module, so track that
   // ourselves here.
   ModuleSP executable = GetTargetExecutable();
-  m_loaded_modules[executable] = m_rendezvous.GetLinkMapAddress();
+  SetLoadedModule(executable, m_rendezvous.GetLinkMapAddress());
 
   DYLDRendezvous::iterator I;
   DYLDRendezvous::iterator E;
@@ -470,34 +493,66 @@ void DynamicLoaderPOSIXDYLD::RefreshModules() {
       E = m_rendezvous.end();
       m_initial_modules_added = true;
     }
-    for (; I != E; ++I) {
-      // Don't load a duplicate copy of ld.so if we have already loaded it
-      // earlier in LoadInterpreterModule. If we instead loaded then unloaded it
-      // later, the section information for ld.so would be removed. That
-      // information is required for placing breakpoints on Arm/Thumb systems.
-      if ((m_interpreter_module.lock() != nullptr) &&
-          (I->base_addr == m_interpreter_base))
-        continue;
-
-      ModuleSP module_sp =
-          LoadModuleAtAddress(I->file_spec, I->link_addr, I->base_addr, true);
-      if (!module_sp.get())
-        continue;
-
-      if (module_sp->GetObjectFile()->GetBaseAddress().GetLoadAddress(
-              &m_process->GetTarget()) == m_interpreter_base) {
-        ModuleSP interpreter_sp = m_interpreter_module.lock();
-        if (m_interpreter_module.lock() == nullptr) {
-          m_interpreter_module = module_sp;
-        } else if (module_sp == interpreter_sp) {
-          // Module already loaded.
-          continue;
-        }
-      }
 
-      loaded_modules.AppendIfNeeded(module_sp);
-      new_modules.Append(module_sp);
+    std::mutex interpreter_module_mutex;
+    // Loads the module for one rendezvous entry. SOEntry is taken by reference
+    // as the data should live in `m_rendezvous` for the duration of this call.
+    auto load_module_fn =
+        [this, &loaded_modules, &new_modules,
+         &interpreter_module_mutex](const DYLDRendezvous::SOEntry &so_entry) {
+          // Don't load a duplicate copy of ld.so if we have already loaded it
+          // earlier in LoadInterpreterModule. If we instead loaded then
+          // unloaded it later, the section information for ld.so would be
+          // removed. That information is required for placing breakpoints on
+          // Arm/Thumb systems.
+          {
+            // `m_interpreter_module` may be modified by another thread at the
+            // same time, so we guard the access here.
+            std::lock_guard<std::mutex> lock(interpreter_module_mutex);
+            if ((m_interpreter_module.lock() != nullptr) &&
+                (so_entry.base_addr == m_interpreter_base))
+              return;
+          }
+
+          ModuleSP module_sp = LoadModuleAtAddress(
+              so_entry.file_spec, so_entry.link_addr, so_entry.base_addr, true);
+          if (!module_sp.get())
+            return;
+
+          {
+            // The check-and-set of `m_interpreter_module` below must not
+            // interleave with another task, so it runs under the same lock.
+            std::lock_guard<std::mutex> lock(interpreter_module_mutex);
+            // Set the interpreter module, if this is the interpreter.
+            if (module_sp->GetObjectFile()->GetBaseAddress().GetLoadAddress(
+                    &m_process->GetTarget()) == m_interpreter_base) {
+              ModuleSP interpreter_sp = m_interpreter_module.lock();
+              if (interpreter_sp == nullptr) {
+                m_interpreter_module = module_sp;
+              } else if (module_sp == interpreter_sp) {
+                // Module already loaded.
+                return;
+              }
+            }
+          }
+
+          loaded_modules.AppendIfNeeded(module_sp);
+          new_modules.Append(module_sp);
+        };
+
+    // Loading modules in parallel tends to be faster, but is still unstable.
+    // Once it's stable, we can remove this setting and remove the serial
+    // approach.
+    if (GetGlobalPluginProperties().GetParallelModuleLoad()) {
+      llvm::ThreadPoolTaskGroup task_group(Debugger::GetThreadPool());
+      for (; I != E; ++I)
+        task_group.async(load_module_fn, *I);
+      task_group.wait();
+    } else {
+      for (; I != E; ++I)
+        load_module_fn(*I);
     }
+
     m_process->GetTarget().ModulesDidLoad(new_modules);
   }
 
@@ -683,7 +738,7 @@ void DynamicLoaderPOSIXDYLD::LoadAllCurrentModules() {
   // The rendezvous class doesn't enumerate the main module, so track that
   // ourselves here.
   ModuleSP executable = GetTargetExecutable();
-  m_loaded_modules[executable] = m_rendezvous.GetLinkMapAddress();
+  SetLoadedModule(executable, m_rendezvous.GetLinkMapAddress());
 
   std::vector<FileSpec> module_names;
   for (I = m_rendezvous.begin(), E = m_rendezvous.end(); I != E; ++I)
@@ -775,15 +830,15 @@ DynamicLoaderPOSIXDYLD::GetThreadLocalData(const lldb::ModuleSP module_sp,
                                            const lldb::ThreadSP thread,
                                            lldb::addr_t tls_file_addr) {
   Log *log = GetLog(LLDBLog::DynamicLoader);
-  auto it = m_loaded_modules.find(module_sp);
-  if (it == m_loaded_modules.end()) {
+  std::optional<addr_t> link_map_addr_opt = GetLoadedModuleLinkAddr(module_sp);
+  if (!link_map_addr_opt.has_value()) {
     LLDB_LOGF(
         log, "GetThreadLocalData error: module(%s) not found in loaded modules",
         module_sp->GetObjectName().AsCString());
     return LLDB_INVALID_ADDRESS;
   }
 
-  addr_t link_map = it->second;
+  addr_t link_map = link_map_addr_opt.value();
   if (link_map == LLDB_INVALID_ADDRESS || link_map == 0) {
     LLDB_LOGF(log,
               "GetThreadLocalData error: invalid link map address=0x%" PRIx64,
diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h
@@ -95,10 +95,6 @@ class DynamicLoaderPOSIXDYLD : public lldb_private::DynamicLoader {
   /// Contains the pointer to the interpret module, if loaded.
   std::weak_ptr<lldb_private::Module> m_interpreter_module;
 
-  /// Loaded module list. (link map for each module)
-  std::map<lldb::ModuleWP, lldb::addr_t, std::owner_less<lldb::ModuleWP>>
-      m_loaded_modules;
-
   /// Returns true if the process is for a core file.
   bool IsCoreFile() const;
 
@@ -182,6 +178,19 @@ class DynamicLoaderPOSIXDYLD : public lldb_private::DynamicLoader {
   DynamicLoaderPOSIXDYLD(const DynamicLoaderPOSIXDYLD &) = delete;
   const DynamicLoaderPOSIXDYLD &
   operator=(const DynamicLoaderPOSIXDYLD &) = delete;
+
+  /// Loaded module list. (link map for each module)
+  /// This may be accessed in a multi-threaded context. Use the accessor methods
+  /// to access `m_loaded_modules` safely.
+  std::map<lldb::ModuleWP, lldb::addr_t, std::owner_less<lldb::ModuleWP>>
+      m_loaded_modules;
+  std::shared_mutex m_loaded_modules_rw_mutex; // Guards m_loaded_modules.
+
+  void SetLoadedModule(const lldb::ModuleSP &module_sp,
+                       lldb::addr_t link_map_addr); // Inserts or overwrites the entry for module_sp.
+  void UnloadModule(const lldb::ModuleSP &module_sp); // Erases the entry for module_sp.
+  std::optional<lldb::addr_t>
+  GetLoadedModuleLinkAddr(const lldb::ModuleSP &module_sp); // std::nullopt when module_sp has no entry.
 };
 
 #endif // LLDB_SOURCE_PLUGINS_DYNAMICLOADER_POSIX_DYLD_DYNAMICLOADERPOSIXDYLD_H