
Commit c7b5f7c

[Support] Use atomic counter in parallelFor instead of per-task spawning (llvm#187989)
This function is primarily used by lld and debug info tools. Instead of pre-splitting work into up to MaxTasksPerGroup (1024) tasks and spawning each through the Executor's mutex+condvar, use an atomic counter for work distribution. Only ThreadCount workers are spawned; each grabs the next chunk via atomic fetch_add.

This reduces futex calls from ~31K (glibc, release+assertions build) to ~1.4K when linking clang-14 (191MB PIE with --export-dynamic) with `ld.lld --threads=8`: previously each parallelFor spawned up to 1024 tasks, each requiring a mutex lock + condvar signal.

```
                            Wall    System  futex
glibc (assertions) before:  927ms   897ms   31K
glibc (assertions) after:   879ms   765ms   1.4K
mimalloc before:            872ms   694ms   25K
mimalloc after:             830ms   661ms   1K
```
1 parent e73d8f8 commit c7b5f7c

1 file changed

Lines changed: 23 additions & 18 deletions


llvm/lib/Support/Parallel.cpp

```diff
@@ -256,26 +256,31 @@ void llvm::parallelFor(size_t Begin, size_t End,
                        llvm::function_ref<void(size_t)> Fn) {
 #if LLVM_ENABLE_THREADS
   if (parallel::strategy.ThreadsRequested != 1) {
-    auto NumItems = End - Begin;
-    // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling
-    // overhead on large inputs.
-    auto TaskSize = NumItems / parallel::detail::MaxTasksPerGroup;
-    if (TaskSize == 0)
-      TaskSize = 1;
+    size_t NumItems = End - Begin;
+    if (NumItems == 0)
+      return;
+    // Distribute work via an atomic counter shared by NumWorkers threads,
+    // keeping the task count (and thus Linux futex calls) at O(ThreadCount).
+    // For lld, per-file work is somewhat uneven, so a multiplier > 1 is safer.
+    // While 2 vs 4 vs 8 makes no measurable difference, 4 is used as a
+    // reasonable default.
+    size_t NumWorkers = std::min<size_t>(NumItems, parallel::getThreadCount());
+    size_t ChunkSize = std::max(size_t(1), NumItems / (NumWorkers * 4));
+    std::atomic<size_t> Idx{Begin};
+    auto Worker = [&] {
+      while (true) {
+        size_t I = Idx.fetch_add(ChunkSize, std::memory_order_relaxed);
+        if (I >= End)
+          break;
+        size_t IEnd = std::min(I + ChunkSize, End);
+        for (; I < IEnd; ++I)
+          Fn(I);
+      }
+    };
 
     parallel::TaskGroup TG;
-    for (; Begin + TaskSize < End; Begin += TaskSize) {
-      TG.spawn([=, &Fn] {
-        for (size_t I = Begin, E = Begin + TaskSize; I != E; ++I)
-          Fn(I);
-      });
-    }
-    if (Begin != End) {
-      TG.spawn([=, &Fn] {
-        for (size_t I = Begin; I != End; ++I)
-          Fn(I);
-      });
-    }
+    for (size_t I = 0; I != NumWorkers; ++I)
+      TG.spawn(Worker);
     return;
   }
 #endif
```
