Change default thread pool sizes to respect processor affinity (#6604)

Instead of using the number of processors in the host, use the number of processors available to the process, respecting cpu affinity assignments. Without pthreads, fall back and use the number of processors in the host as before. This is now applied everywhere, so running `numactl -C 0-3 verilator` or `numactl -C 0-3 Vsim` should behave as if the host has 4 cores (e.g. like in CI jobs).
2025-10-28 19:10:40 +01:00 · 2025-10-28 19:10:40 +01:00 · ffbb3229a8
parent 5642de432b
commit ffbb3229a8
12 changed files with 72 additions and 45 deletions
--- a/docs/guide/exe_verilator.rst
+++ b/docs/guide/exe_verilator.rst
@ -169,9 +169,10 @@ Summary:
 .. option:: --build-jobs <value>
   Specify the level of parallelism for :vlopt:`--build`.  If zero, uses the
-   number of threads in the current hardware. Otherwise, the <value> must
+   number of threads available to the process, which is the number of threads
-   be a positive integer specifying the maximum number of parallel build
+   assigned by processor affinity (e.g. using `numactl`), or the number of
-   jobs.
+   threads in the host hardware if unspecified.  Otherwise, the <value> must be
   a positive integer specifying the maximum number of parallel build jobs.
   If not provided, and :vlopt:`-j` is provided, the :vlopt:`-j` value is
   used.
@ -881,9 +882,10 @@ Summary:
   of Verilator if :vlopt:`--verilate-jobs` isn't provided. Also sets
   :vlopt:`--output-groups` if it isn't provided.
-   If zero, uses the number of threads in the current hardware. Otherwise,
+   If zero, uses the number of threads available to the process, which is the
-   must be a positive integer specifying the maximum number of parallel
+   number of threads assigned by processor affinity (e.g. using `numactl`), or
-   build jobs.
+   the number of threads in the host hardware if unspecified.  Otherwise, must
   be a positive integer specifying the maximum number of parallel build jobs.
 .. option:: --no-json-edit-nums
@ -1831,7 +1833,9 @@ Summary:
 .. option:: --verilate-jobs <value>
   Specify the level of parallelism for the internal compilation steps of
-   Verilator. If zero, uses the number of threads in the current hardware.
+   Verilator.  If zero, uses the number of threads available to the process,
   which is the number of threads assigned by processor affinity (e.g. using
   `numactl`), or the number of threads in the host hardware if unspecified.
   Otherwise, must be a positive integer specifying the maximum number of
   parallel build jobs.
--- a/include/verilated.cpp
+++ b/include/verilated.cpp
@ -2807,11 +2807,11 @@ void VerilatedContext::threads(unsigned n) {
    if (m_threads == n) return;  // To avoid unnecessary warnings
    m_threads = n;
-    const unsigned hardwareThreadsAvailable = std::thread::hardware_concurrency();
+    const unsigned threadsAvailableToProcess = VlOs::getProcessDefaultParallelism();
-    if (m_threads > hardwareThreadsAvailable) {
+    if (m_threads > threadsAvailableToProcess) {
-        VL_PRINTF_MT("%%Warning: System has %u hardware threads but simulation thread count set "
+        VL_PRINTF_MT("%%Warning: Process has %u hardware threads available, but simulation thread "
-                     "to %u. This will likely cause significant slowdown.\n",
+                     "count set to %u. This will likely cause significant slowdown.\n",
-                     hardwareThreadsAvailable, m_threads);
+                     threadsAvailableToProcess, m_threads);
    }
 }
--- a/include/verilated.h
+++ b/include/verilated.h
@ -451,7 +451,7 @@ protected:
    // Implementation details
    const std::unique_ptr<VerilatedContextImpData> m_impdatap;
    // Number of threads to use for simulation (size of m_threadPool + 1 for main thread)
-    unsigned m_threads = std::thread::hardware_concurrency();
+    unsigned m_threads = VlOs::getProcessDefaultParallelism();
    // Number of threads in added models
    unsigned m_threadsInModels = 0;
    // The thread pool shared by all models added to this context
--- a/include/verilated_threads.cpp
+++ b/include/verilated_threads.cpp
@ -118,30 +118,18 @@ VlThreadPool::~VlThreadPool() {
    for (auto& i : m_workers) delete i;
 }
 bool VlThreadPool::isNumactlRunning() {
    // Heuristic: treat the process as running under numactl when the calling
    // thread's affinity mask excludes at least one of the host's processors.
 #if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
    cpu_set_t affinity;
    CPU_ZERO(&affinity);
    if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &affinity) != 0) {
        // Query failed; answering true is assumed to be the least-damage option
        return true;
    }
    // Only inspect ids that both exist in the host and fit in the mask
    const unsigned hostCpus = std::thread::hardware_concurrency();
    const unsigned limit = std::min(hostCpus, static_cast<unsigned>(CPU_SETSIZE));
    unsigned cpu = 0;
    while (cpu < limit) {
        if (!CPU_ISSET(cpu, &affinity)) return true;  // A processor is masked off
        ++cpu;
    }
 #endif
    // Either no affinity support, or every inspected processor is allowed
    return false;
 }
 std::string VlThreadPool::numaAssign() {
 #if defined(__linux) || defined(CPU_ZERO) || defined(VL_CPPCHECK)  // Linux-like pthreads
-    // If not under numactl, make a reasonable processor affinity selection
+    // Get number of processor available to the current process
-    if (isNumactlRunning()) return "running under numactl";  // User presumably set affinity
+    const unsigned num_proc = VlOs::getProcessAvailableParallelism();
    if (!num_proc) return "Can't determine number of available threads";
    // If fewer than hardware threads in the host, user presumably set affinity
    if (num_proc < std::thread::hardware_concurrency()) return "processor affinity already set";
    // Make a reasonable processor affinity selection
    const int num_threads = static_cast<int>(m_workers.size());
    const int num_proc = static_cast<int>(std::thread::hardware_concurrency());
    if (num_threads < 2) return "too few threads";
-    if (num_threads > num_proc) return "too many threads";
+    if (static_cast<unsigned>(num_threads) > num_proc) return "too many threads";
    // Read CPU info.
    // Uncertain if any modern system has gaps in the processor id (Solaris
--- a/include/verilated_threads.h
+++ b/include/verilated_threads.h
@ -241,8 +241,6 @@ public:
 private:
    VL_UNCOPYABLE(VlThreadPool);
    // cppcheck-suppress unusedPrivateFunction
    static bool isNumactlRunning();
    std::string numaAssign();
 };
--- a/include/verilatedos.h
+++ b/include/verilatedos.h
@ -643,6 +643,16 @@ extern std::string getenvStr(const std::string& envvar,
 /// Return currently executing processor number; may do an OS call underneath so slow
 extern uint16_t getcpu() VL_MT_SAFE;
 /// Return number of processors available to the current process. This might be
 /// less than the number of logical processors in the machine, if a processor
 /// affinity mask was used, e.g. via 'numactl -C 0-3'. Returns 0 if cannot
 /// be determined.
 extern unsigned getProcessAvailableParallelism() VL_MT_SAFE;
 /// Return getProcessAvailableParallelism if non-zero, otherwise the number of
 /// hardware threads in the host machine.
 extern unsigned getProcessDefaultParallelism() VL_MT_SAFE;
 /// Return memory usage in bytes, or 0 if unknown
 extern void memUsageBytes(uint64_t& peakr, uint64_t& currentr) VL_MT_SAFE;
--- a/include/verilatedos_c.h
+++ b/include/verilatedos_c.h
@ -104,6 +104,34 @@ uint16_t getcpu() VL_MT_SAFE {
 #endif
 }
 //=============================================================================
 // VlOs::getProcessAvailableParallelism implementation
 unsigned getProcessAvailableParallelism() VL_MT_SAFE {
 #if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
    // Fetch the affinity mask of the calling thread
    cpu_set_t affinity;
    CPU_ZERO(&affinity);
    if (pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity) == 0) {
        // Tally every processor id that is present in the mask
        unsigned count = 0;
        int cpu = 0;
        while (cpu < CPU_SETSIZE) {
            if (CPU_ISSET(cpu, &affinity)) ++count;
            ++cpu;
        }
        return count;
    }
 #endif
    // No affinity support compiled in, or the query failed: report "unknown"
    return 0;
 }
 //=============================================================================
 // VlOs::getProcessDefaultParallelism implementation
 unsigned getProcessDefaultParallelism() VL_MT_SAFE {
    // Prefer the count of processors available to this process; a zero result
    // means it could not be determined, so use the host's hardware thread count.
    const unsigned available = getProcessAvailableParallelism();
    if (available != 0) return available;
    return std::thread::hardware_concurrency();
 }
 //=========================================================================
 // VlOs::memPeakUsageBytes implementation
--- a/nodist/install_test
+++ b/nodist/install_test
@ -3,7 +3,6 @@
 ######################################################################
 import argparse
 import multiprocessing
 import os
 import shutil
 import subprocess
@ -93,7 +92,7 @@ def cleanenv():
 def calc_jobs():
-    return multiprocessing.cpu_count() + 1
+    return len(os.sched_getaffinity(0)) + 1
 def run(command):
--- a/src/V3Options.cpp
+++ b/src/V3Options.cpp
@ -1267,7 +1267,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
                                                                              << "' was passed");
            val = 1;
        } else if (val == 0) {
-            val = std::thread::hardware_concurrency();
+            val = VlOs::getProcessDefaultParallelism();
        }
        m_buildJobs = val;
    });
@ -1781,7 +1781,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
                        << valp << "' was passed");
            val = 1;
        } else if (val == 0) {
-            val = std::thread::hardware_concurrency();
+            val = VlOs::getProcessDefaultParallelism();
        }
        m_verilateJobs = val;
    });
@ -1932,7 +1932,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
            int val = 0;
            if (i < argc && std::isdigit(argv[i][0])) {
                val = std::atoi(argv[i]);  // Can't be negative due to isdigit above
-                if (val == 0) val = std::thread::hardware_concurrency();
+                if (val == 0) val = VlOs::getProcessDefaultParallelism();
                ++i;
            }
            if (m_buildJobs == -1) m_buildJobs = val;
--- a/test_regress/driver.py
+++ b/test_regress/driver.py
@ -2763,7 +2763,7 @@ def _calc_hashset() -> list:
@lru_cache(maxsize=1)
 def max_procs() -> int:
-    procs = multiprocessing.cpu_count()
+    procs = len(os.sched_getaffinity(0))
    if procs < 2:
        print("driver.py: Python didn't find at least two CPUs")
    return procs
--- a/test_regress/t/t_a7_hier_block_cmake.py
+++ b/test_regress/t/t_a7_hier_block_cmake.py
@ -8,7 +8,7 @@
 # SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
 import vltest_bootstrap
-import multiprocessing
+import os
 # If a test fails, broken .cmake may disturb the next run
 test.clean_objs()
@ -30,7 +30,7 @@ test.run(logfile=test.obj_dir + "/cmake.log",
 test.run(logfile=test.obj_dir + "/build.log",
         cmd=[
             'cd "' + test.obj_dir + '" && cmake --build', '.', ('-v' if test.verbose else ''),
-             '-j ' + str(multiprocessing.cpu_count()), '--', "CXX_FLAGS=" + str(threads)
+             '-j ' + str(len(os.sched_getaffinity(0))), '--', "CXX_FLAGS=" + str(threads)
         ])
 test.run(logfile=test.obj_dir + "/run.log",
--- a/test_regress/t/t_threads_crazy_context.py
+++ b/test_regress/t/t_threads_crazy_context.py
@ -19,7 +19,7 @@ test.execute()
 if test.vltmt:
    test.file_grep(
        test.run_log_filename,
-        r'System has \d+ hardware threads but simulation thread count set to 1024\. This will likely cause significant slowdown\.'
+        r'Process has \d+ hardware threads available, but simulation thread count set to 1024\. This will likely cause significant slowdown\.'
    )
 test.passes()