From ffbb3229a815ebba5e5dd2d3c5c050702a5951eb Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Tue, 28 Oct 2025 19:10:40 +0100
Subject: [PATCH] Change default thread pool sizes to respect processor
 affinity (#6604)

Instead of using the number of processors in the host, use the number of
processors available to the process, respecting cpu affinity
assignments. Without pthreads, fall back and use the number of
processors in the host as before.

This is now applied everywhere so running `numactl -C 0-3 verilator` or
`numactl -C 0-3 Vsim` should behave as if the host has 4 cores (e.g.
like in CI jobs).
---
 docs/guide/exe_verilator.rst              | 20 +++++++++-------
 include/verilated.cpp                     | 10 ++++----
 include/verilated.h                       |  2 +-
 include/verilated_threads.cpp             | 28 +++++++----------------
 include/verilated_threads.h               |  2 --
 include/verilatedos.h                     | 10 ++++++++
 include/verilatedos_c.h                   | 28 +++++++++++++++++++++++
 nodist/install_test                       |  3 +--
 src/V3Options.cpp                         |  6 ++---
 test_regress/driver.py                    |  2 +-
 test_regress/t/t_a7_hier_block_cmake.py   |  4 ++--
 test_regress/t/t_threads_crazy_context.py |  2 +-
 12 files changed, 72 insertions(+), 45 deletions(-)
diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst
index 22ee03181..10dfcd611 100644
--- a/docs/guide/exe_verilator.rst
+++ b/docs/guide/exe_verilator.rst
@@ -168,10 +168,11 @@ Summary:
 
 .. option:: --build-jobs <value>
 
-   Specify the level of parallelism for :vlopt:`--build`. If zero, uses the
-   number of threads in the current hardware. Otherwise, the <value> must
-   be a positive integer specifying the maximum number of parallel build
-   jobs.
+   Specify the level of parallelism for :vlopt:`--build`.  If zero, uses the
+   number of threads available to the process, which is the number of threads
+   assigned by processor affinity (e.g. using `numactl`), or the number of
+   threads in the host hardware if unspecified.  Otherwise, the <value> must be
+   a positive integer specifying the maximum number of parallel build jobs.
 
    If not provided, and :vlopt:`-j` is provided, the :vlopt:`-j` value is
    used.
@@ -881,9 +882,10 @@ Summary:
    of Verilator if :vlopt:`--verilate-jobs` isn't provided. Also sets
    :vlopt:`--output-groups` if isn't provided.
 
-   If zero, uses the number of threads in the current hardware. Otherwise,
-   must be a positive integer specifying the maximum number of parallel
-   build jobs.
+   If zero, uses the number of threads available to the process, which is the
+   number of threads assigned by processor affinity (e.g. using `numactl`), or
+   the number of threads in the host hardware if unspecified.  Otherwise, must
+   be a positive integer specifying the maximum number of parallel build jobs.
 
 .. option:: --no-json-edit-nums
 
@@ -1831,7 +1833,9 @@ Summary:
 .. option:: --verilate-jobs <value>
 
    Specify the level of parallelism for the internal compilation steps of
-   Verilator. If zero, uses the number of threads in the current hardware.
+   Verilator.  If zero, uses the number of threads available to the process,
+   which is the number of threads assigned by processor affinity (e.g. using
+   `numactl`), or the number of threads in the host hardware if unspecified.
    Otherwise, must be a positive integer specifying the maximum number of
    parallel build jobs.
 
diff --git a/include/verilated.cpp b/include/verilated.cpp
index d2bf2d93c..185c01838 100644
--- a/include/verilated.cpp
+++ b/include/verilated.cpp
@@ -2807,11 +2807,11 @@ void VerilatedContext::threads(unsigned n) {
 
     if (m_threads == n) return;  // To avoid unnecessary warnings
     m_threads = n;
-    const unsigned hardwareThreadsAvailable = std::thread::hardware_concurrency();
-    if (m_threads > hardwareThreadsAvailable) {
-        VL_PRINTF_MT("%%Warning: System has %u hardware threads but simulation thread count set "
-                     "to %u. This will likely cause significant slowdown.\n",
-                     hardwareThreadsAvailable, m_threads);
+    const unsigned threadsAvailableToProcess = VlOs::getProcessDefaultParallelism();
+    if (m_threads > threadsAvailableToProcess) {
+        VL_PRINTF_MT("%%Warning: Process has %u hardware threads available, but simulation thread "
+                     "count set to %u. This will likely cause significant slowdown.\n",
+                     threadsAvailableToProcess, m_threads);
     }
 }
 
diff --git a/include/verilated.h b/include/verilated.h
index 294ab94a1..fd784b48f 100644
--- a/include/verilated.h
+++ b/include/verilated.h
@@ -451,7 +451,7 @@ protected:
     // Implementation details
     const std::unique_ptr<VerilatedContextImpData> m_impdatap;
     // Number of threads to use for simulation (size of m_threadPool + 1 for main thread)
-    unsigned m_threads = std::thread::hardware_concurrency();
+    unsigned m_threads = VlOs::getProcessDefaultParallelism();
     // Number of threads in added models
     unsigned m_threadsInModels = 0;
     // The thread pool shared by all models added to this context
diff --git a/include/verilated_threads.cpp b/include/verilated_threads.cpp
index acb8a11eb..e6ffa7e3f 100644
--- a/include/verilated_threads.cpp
+++ b/include/verilated_threads.cpp
@@ -118,30 +118,18 @@ VlThreadPool::~VlThreadPool() {
     for (auto& i : m_workers) delete i;
 }
 
-bool VlThreadPool::isNumactlRunning() {
-    // We assume if current thread is CPU-masked, then under numactl, otherwise not.
-    // This shows that numactl is visible through the affinity mask
-#if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
-    const unsigned num_cpus = std::thread::hardware_concurrency();
-    cpu_set_t cpuset;
-    CPU_ZERO(&cpuset);
-    const int rc = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
-    if (rc != 0) return true;  // Error; assuming returning true is the least-damage option
-    for (unsigned c = 0; c < std::min(num_cpus, static_cast<unsigned>(CPU_SETSIZE)); ++c) {
-        if (!CPU_ISSET(c, &cpuset)) return true;
-    }
-#endif
-    return false;
-}
-
 std::string VlThreadPool::numaAssign() {
 #if defined(__linux) || defined(CPU_ZERO) || defined(VL_CPPCHECK)  // Linux-like pthreads
-    // If not under numactl, make a reasonable processor affinity selection
-    if (isNumactlRunning()) return "running under numactl";  // User presumably set affinity
+    // Get number of processors available to the current process
+    const unsigned num_proc = VlOs::getProcessAvailableParallelism();
+    if (!num_proc) return "Can't determine number of available threads";
+    // If fewer than hardware threads in the host, user presumably set affinity
+    if (num_proc < std::thread::hardware_concurrency()) return "processor affinity already set";
+
+    // Make a reasonable processor affinity selection
     const int num_threads = static_cast<int>(m_workers.size());
-    const int num_proc = static_cast<int>(std::thread::hardware_concurrency());
     if (num_threads < 2) return "too few threads";
-    if (num_threads > num_proc) return "too many threads";
+    if (static_cast<unsigned>(num_threads) > num_proc) return "too many threads";
 
     // Read CPU info.
     // Uncertain if any modern system has gaps in the processor id (Solaris
diff --git a/include/verilated_threads.h b/include/verilated_threads.h
index 643ebcf0b..f39015971 100644
--- a/include/verilated_threads.h
+++ b/include/verilated_threads.h
@@ -241,8 +241,6 @@ public:
 private:
     VL_UNCOPYABLE(VlThreadPool);
 
-    // cppcheck-suppress unusedPrivateFunction
-    static bool isNumactlRunning();
     std::string numaAssign();
 };
 
diff --git a/include/verilatedos.h b/include/verilatedos.h
index 2049eb0cf..7fd03f1e8 100644
--- a/include/verilatedos.h
+++ b/include/verilatedos.h
@@ -643,6 +643,16 @@ extern std::string getenvStr(const std::string& envvar,
 /// Return currently executing processor number; may do an OS call underneath so slow
 extern uint16_t getcpu() VL_MT_SAFE;
 
+/// Return number of processors available to the current process. This might be
+/// less than the number of logical processors in the machine, if a processor
+/// affinity mask was used, e.g. via 'numactl -C 0-3'. Returns 0 if it cannot
+/// be determined.
+extern unsigned getProcessAvailableParallelism() VL_MT_SAFE;
+
+/// Return getProcessAvailableParallelism if non-zero, otherwise the number of
+/// hardware threads in the host machine.
+extern unsigned getProcessDefaultParallelism() VL_MT_SAFE;
+
 /// Return memory usage in bytes, or 0 if unknown
 extern void memUsageBytes(uint64_t& peakr, uint64_t& currentr) VL_MT_SAFE;
 
diff --git a/include/verilatedos_c.h b/include/verilatedos_c.h
index 849419701..cf1588971 100644
--- a/include/verilatedos_c.h
+++ b/include/verilatedos_c.h
@@ -104,6 +104,34 @@ uint16_t getcpu() VL_MT_SAFE {
 #endif
 }
 
+//=============================================================================
+// VlOs::getProcessAvailableParallelism implementation
+
+unsigned getProcessAvailableParallelism() VL_MT_SAFE {
+#if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    const int rc = pthread_getaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
+    if (rc == 0) {
+        unsigned nCpus = 0;
+        for (int i = 0; i < CPU_SETSIZE; ++i) {
+            if (CPU_ISSET(i, &cpuset)) ++nCpus;
+        }
+        return nCpus;
+    }
+#endif
+    // Cannot determine
+    return 0;
+}
+
+//=============================================================================
+// VlOs::getProcessDefaultParallelism implementation
+
+unsigned getProcessDefaultParallelism() VL_MT_SAFE {
+    const unsigned n = getProcessAvailableParallelism();
+    return n ? n : std::thread::hardware_concurrency();
+}
+
 //=========================================================================
 // VlOs::memPeakUsageBytes implementation
 
diff --git a/nodist/install_test b/nodist/install_test
index e345f9474..67c0fe68e 100755
--- a/nodist/install_test
+++ b/nodist/install_test
@@ -3,7 +3,6 @@
 ######################################################################
 
 import argparse
-import multiprocessing
 import os
 import shutil
 import subprocess
@@ -93,7 +92,7 @@ def cleanenv():
 
 
 def calc_jobs():
-    return multiprocessing.cpu_count() + 1
+    return len(os.sched_getaffinity(0)) + 1
 
 
 def run(command):
diff --git a/src/V3Options.cpp b/src/V3Options.cpp
index 46b1221c5..1b38796d0 100644
--- a/src/V3Options.cpp
+++ b/src/V3Options.cpp
@@ -1267,7 +1267,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
                                                                               << "' was passed");
             val = 1;
         } else if (val == 0) {
-            val = std::thread::hardware_concurrency();
+            val = VlOs::getProcessDefaultParallelism();
         }
         m_buildJobs = val;
     });
@@ -1781,7 +1781,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
                         << valp << "' was passed");
             val = 1;
         } else if (val == 0) {
-            val = std::thread::hardware_concurrency();
+            val = VlOs::getProcessDefaultParallelism();
         }
         m_verilateJobs = val;
     });
@@ -1932,7 +1932,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
             int val = 0;
             if (i < argc && std::isdigit(argv[i][0])) {
                 val = std::atoi(argv[i]);  // Can't be negative due to isdigit above
-                if (val == 0) val = std::thread::hardware_concurrency();
+                if (val == 0) val = VlOs::getProcessDefaultParallelism();
                 ++i;
             }
             if (m_buildJobs == -1) m_buildJobs = val;
diff --git a/test_regress/driver.py b/test_regress/driver.py
index 13f247e74..b7baec517 100755
--- a/test_regress/driver.py
+++ b/test_regress/driver.py
@@ -2763,7 +2763,7 @@ def _calc_hashset() -> list:
 
 @lru_cache(maxsize=1)
 def max_procs() -> int:
-    procs = multiprocessing.cpu_count()
+    procs = len(os.sched_getaffinity(0))
     if procs < 2:
         print("driver.py: Python didn't find at least two CPUs")
     return procs
diff --git a/test_regress/t/t_a7_hier_block_cmake.py b/test_regress/t/t_a7_hier_block_cmake.py
index 508e8b999..fac3ee154 100755
--- a/test_regress/t/t_a7_hier_block_cmake.py
+++ b/test_regress/t/t_a7_hier_block_cmake.py
@@ -8,7 +8,7 @@
 # SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
 
 import vltest_bootstrap
-import multiprocessing
+import os
 
 # If a test fails, broken .cmake may disturb the next run
 test.clean_objs()
@@ -30,7 +30,7 @@ test.run(logfile=test.obj_dir + "/cmake.log",
 test.run(logfile=test.obj_dir + "/build.log",
          cmd=[
              'cd "' + test.obj_dir + '" && cmake --build', '.', ('-v' if test.verbose else ''),
-             '-j ' + str(multiprocessing.cpu_count()), '--', "CXX_FLAGS=" + str(threads)
+             '-j ' + str(len(os.sched_getaffinity(0))), '--', "CXX_FLAGS=" + str(threads)
          ])
 
 test.run(logfile=test.obj_dir + "/run.log",
diff --git a/test_regress/t/t_threads_crazy_context.py b/test_regress/t/t_threads_crazy_context.py
index 6ef9e273d..ae52d0316 100755
--- a/test_regress/t/t_threads_crazy_context.py
+++ b/test_regress/t/t_threads_crazy_context.py
@@ -19,7 +19,7 @@ test.execute()
 if test.vltmt:
     test.file_grep(
         test.run_log_filename,
-        r'System has \d+ hardware threads but simulation thread count set to 1024\. This will likely cause significant slowdown\.'
+        r'Process has \d+ hardware threads available, but simulation thread count set to 1024\. This will likely cause significant slowdown\.'
     )
 
 test.passes()