From ffbb3229a815ebba5e5dd2d3c5c050702a5951eb Mon Sep 17 00:00:00 2001 From: Geza Lore Date: Tue, 28 Oct 2025 19:10:40 +0100 Subject: [PATCH] Change default thread pool sizes to respect processor affinity (#6604) Instead of using the number of processors in the host, use the number of processors available to the process, respecting cpu affinity assignments. Without pthreads, fall back and use the number of processors in the host as before. This is now applied everywhere so running `numactl -C 0-3 verilator` or `numactl -C 0-3 Vsim` should behave as if the host has 4 cores (e.g. like in CI jobs) --- docs/guide/exe_verilator.rst | 20 +++++++++------- include/verilated.cpp | 10 ++++---- include/verilated.h | 2 +- include/verilated_threads.cpp | 28 +++++++---------------- include/verilated_threads.h | 2 -- include/verilatedos.h | 10 ++++++++ include/verilatedos_c.h | 28 +++++++++++++++++++++++ nodist/install_test | 3 +-- src/V3Options.cpp | 6 ++--- test_regress/driver.py | 2 +- test_regress/t/t_a7_hier_block_cmake.py | 4 ++-- test_regress/t/t_threads_crazy_context.py | 2 +- 12 files changed, 72 insertions(+), 45 deletions(-) diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst index 22ee03181..10dfcd611 100644 --- a/docs/guide/exe_verilator.rst +++ b/docs/guide/exe_verilator.rst @@ -168,10 +168,11 @@ Summary: .. option:: --build-jobs - Specify the level of parallelism for :vlopt:`--build`. If zero, uses the - number of threads in the current hardware. Otherwise, the must - be a positive integer specifying the maximum number of parallel build - jobs. + Specify the level of parallelism for :vlopt:`--build`. If zero, uses the + number of threads available to the process, which is the number of threads + assigned by processor affinity (e.g. using `numactl`), or the number of + threads in the host hardware if unspecified. Otherwise, the must be + a positive integer specifying the maximum number of parallel build jobs. 
If not provided, and :vlopt:`-j` is provided, the :vlopt:`-j` value is used. @@ -881,9 +882,10 @@ Summary: of Verilator if :vlopt:`--verilate-jobs` isn't provided. Also sets :vlopt:`--output-groups` if isn't provided. - If zero, uses the number of threads in the current hardware. Otherwise, - must be a positive integer specifying the maximum number of parallel - build jobs. + If zero, uses the number of threads available to the process, which is the + number of threads assigned by processor affinity (e.g. using `numactl`), or + the number of threads in the host hardware if unspecified. Otherwise, must + be a positive integer specifying the maximum number of parallel build jobs. .. option:: --no-json-edit-nums @@ -1831,7 +1833,9 @@ Summary: .. option:: --verilate-jobs Specify the level of parallelism for the internal compilation steps of - Verilator. If zero, uses the number of threads in the current hardware. + Verilator. If zero, uses the number of threads available to the process, + which is the number of threads assigned by processor affinity (e.g. using + `numactl`), or the number of threads in the host hardware if unspecified. Otherwise, must be a positive integer specifying the maximum number of parallel build jobs. diff --git a/include/verilated.cpp b/include/verilated.cpp index d2bf2d93c..185c01838 100644 --- a/include/verilated.cpp +++ b/include/verilated.cpp @@ -2807,11 +2807,11 @@ void VerilatedContext::threads(unsigned n) { if (m_threads == n) return; // To avoid unnecessary warnings m_threads = n; - const unsigned hardwareThreadsAvailable = std::thread::hardware_concurrency(); - if (m_threads > hardwareThreadsAvailable) { - VL_PRINTF_MT("%%Warning: System has %u hardware threads but simulation thread count set " - "to %u. 
This will likely cause significant slowdown.\n", - hardwareThreadsAvailable, m_threads); + const unsigned threadsAvailableToProcess = VlOs::getProcessDefaultParallelism(); + if (m_threads > threadsAvailableToProcess) { + VL_PRINTF_MT("%%Warning: Process has %u hardware threads available, but simulation thread " + "count set to %u. This will likely cause significant slowdown.\n", + threadsAvailableToProcess, m_threads); } } diff --git a/include/verilated.h b/include/verilated.h index 294ab94a1..fd784b48f 100644 --- a/include/verilated.h +++ b/include/verilated.h @@ -451,7 +451,7 @@ protected: // Implementation details const std::unique_ptr m_impdatap; // Number of threads to use for simulation (size of m_threadPool + 1 for main thread) - unsigned m_threads = std::thread::hardware_concurrency(); + unsigned m_threads = VlOs::getProcessDefaultParallelism(); // Number of threads in added models unsigned m_threadsInModels = 0; // The thread pool shared by all models added to this context diff --git a/include/verilated_threads.cpp b/include/verilated_threads.cpp index acb8a11eb..e6ffa7e3f 100644 --- a/include/verilated_threads.cpp +++ b/include/verilated_threads.cpp @@ -118,30 +118,18 @@ VlThreadPool::~VlThreadPool() { for (auto& i : m_workers) delete i; } -bool VlThreadPool::isNumactlRunning() { - // We assume if current thread is CPU-masked, then under numactl, otherwise not. 
- // This shows that numactl is visible through the affinity mask -#if defined(__linux) || defined(CPU_ZERO) // Linux-like; assume we have pthreads etc - const unsigned num_cpus = std::thread::hardware_concurrency(); - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - const int rc = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); - if (rc != 0) return true; // Error; assuming returning true is the least-damage option - for (unsigned c = 0; c < std::min(num_cpus, static_cast(CPU_SETSIZE)); ++c) { - if (!CPU_ISSET(c, &cpuset)) return true; - } -#endif - return false; -} - std::string VlThreadPool::numaAssign() { #if defined(__linux) || defined(CPU_ZERO) || defined(VL_CPPCHECK) // Linux-like pthreads - // If not under numactl, make a reasonable processor affinity selection - if (isNumactlRunning()) return "running under numactl"; // User presumably set affinity + // Get number of processor available to the current process + const unsigned num_proc = VlOs::getProcessAvailableParallelism(); + if (!num_proc) return "Can't determine number of available threads"; + // If fewer than hardware threads in the host, user presumably set affinity + if (num_proc < std::thread::hardware_concurrency()) return "processor affinity already set"; + + // Make a reasonable processor affinity selection const int num_threads = static_cast(m_workers.size()); - const int num_proc = static_cast(std::thread::hardware_concurrency()); if (num_threads < 2) return "too few threads"; - if (num_threads > num_proc) return "too many threads"; + if (static_cast(num_threads) > num_proc) return "too many threads"; // Read CPU info. 
// Uncertain if any modern system has gaps in the processor id (Solaris diff --git a/include/verilated_threads.h b/include/verilated_threads.h index 643ebcf0b..f39015971 100644 --- a/include/verilated_threads.h +++ b/include/verilated_threads.h @@ -241,8 +241,6 @@ public: private: VL_UNCOPYABLE(VlThreadPool); - // cppcheck-suppress unusedPrivateFunction - static bool isNumactlRunning(); std::string numaAssign(); }; diff --git a/include/verilatedos.h b/include/verilatedos.h index 2049eb0cf..7fd03f1e8 100644 --- a/include/verilatedos.h +++ b/include/verilatedos.h @@ -643,6 +643,16 @@ extern std::string getenvStr(const std::string& envvar, /// Return currently executing processor number; may do an OS call underneath so slow extern uint16_t getcpu() VL_MT_SAFE; +/// Return number of processors available to the current process. This might be +/// less than the number of logical processors in the machine, if a processor +/// affinity mask was used, e.g. via 'numactl -C 0-3'. Returns 0 if cannot +/// be determined. +extern unsigned getProcessAvailableParallelism() VL_MT_SAFE; + +/// Return getProcessAvailableParallelism if non-zero, otherwise the number of +/// hardware threads in the host machine. 
+extern unsigned getProcessDefaultParallelism() VL_MT_SAFE; + /// Return memory usage in bytes, or 0 if unknown extern void memUsageBytes(uint64_t& peakr, uint64_t& currentr) VL_MT_SAFE; diff --git a/include/verilatedos_c.h b/include/verilatedos_c.h index 849419701..cf1588971 100644 --- a/include/verilatedos_c.h +++ b/include/verilatedos_c.h @@ -104,6 +104,34 @@ uint16_t getcpu() VL_MT_SAFE { #endif } +//============================================================================= +// Vlos::getProcessAvailableParallelism implementation + +unsigned getProcessAvailableParallelism() VL_MT_SAFE { +#if defined(__linux) || defined(CPU_ZERO) // Linux-like; assume we have pthreads etc + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + const int rc = pthread_getaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + if (rc == 0) { + unsigned nCpus = 0; + for (int i = 0; i < CPU_SETSIZE; ++i) { + if (CPU_ISSET(i, &cpuset)) ++nCpus; + } + return nCpus; + } +#endif + // Cannot determine + return 0; +} + +//============================================================================= +// Vlos::getProcessDefaultParallelism implementation + +unsigned getProcessDefaultParallelism() VL_MT_SAFE { + const unsigned n = getProcessAvailableParallelism(); + return n ? 
n : std::thread::hardware_concurrency(); +} + //========================================================================= // VlOs::memPeakUsageBytes implementation diff --git a/nodist/install_test b/nodist/install_test index e345f9474..67c0fe68e 100755 --- a/nodist/install_test +++ b/nodist/install_test @@ -3,7 +3,6 @@ ###################################################################### import argparse -import multiprocessing import os import shutil import subprocess @@ -93,7 +92,7 @@ def cleanenv(): def calc_jobs(): - return multiprocessing.cpu_count() + 1 + return len(os.sched_getaffinity(0)) + 1 def run(command): diff --git a/src/V3Options.cpp b/src/V3Options.cpp index 46b1221c5..1b38796d0 100644 --- a/src/V3Options.cpp +++ b/src/V3Options.cpp @@ -1267,7 +1267,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, << "' was passed"); val = 1; } else if (val == 0) { - val = std::thread::hardware_concurrency(); + val = VlOs::getProcessDefaultParallelism(); } m_buildJobs = val; }); @@ -1781,7 +1781,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, << valp << "' was passed"); val = 1; } else if (val == 0) { - val = std::thread::hardware_concurrency(); + val = VlOs::getProcessDefaultParallelism(); } m_verilateJobs = val; }); @@ -1932,7 +1932,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, int val = 0; if (i < argc && std::isdigit(argv[i][0])) { val = std::atoi(argv[i]); // Can't be negative due to isdigit above - if (val == 0) val = std::thread::hardware_concurrency(); + if (val == 0) val = VlOs::getProcessDefaultParallelism(); ++i; } if (m_buildJobs == -1) m_buildJobs = val; diff --git a/test_regress/driver.py b/test_regress/driver.py index 13f247e74..b7baec517 100755 --- a/test_regress/driver.py +++ b/test_regress/driver.py @@ -2763,7 +2763,7 @@ def _calc_hashset() -> list: @lru_cache(maxsize=1) def max_procs() -> int: - procs = multiprocessing.cpu_count() + procs = 
len(os.sched_getaffinity(0)) if procs < 2: print("driver.py: Python didn't find at least two CPUs") return procs diff --git a/test_regress/t/t_a7_hier_block_cmake.py b/test_regress/t/t_a7_hier_block_cmake.py index 508e8b999..fac3ee154 100755 --- a/test_regress/t/t_a7_hier_block_cmake.py +++ b/test_regress/t/t_a7_hier_block_cmake.py @@ -8,7 +8,7 @@ # SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 import vltest_bootstrap -import multiprocessing +import os # If a test fails, broken .cmake may disturb the next run test.clean_objs() @@ -30,7 +30,7 @@ test.run(logfile=test.obj_dir + "/cmake.log", test.run(logfile=test.obj_dir + "/build.log", cmd=[ 'cd "' + test.obj_dir + '" && cmake --build', '.', ('-v' if test.verbose else ''), - '-j ' + str(multiprocessing.cpu_count()), '--', "CXX_FLAGS=" + str(threads) + '-j ' + str(len(os.sched_getaffinity(0))), '--', "CXX_FLAGS=" + str(threads) ]) test.run(logfile=test.obj_dir + "/run.log", diff --git a/test_regress/t/t_threads_crazy_context.py b/test_regress/t/t_threads_crazy_context.py index 6ef9e273d..ae52d0316 100755 --- a/test_regress/t/t_threads_crazy_context.py +++ b/test_regress/t/t_threads_crazy_context.py @@ -19,7 +19,7 @@ test.execute() if test.vltmt: test.file_grep( test.run_log_filename, - r'System has \d+ hardware threads but simulation thread count set to 1024\. This will likely cause significant slowdown\.' + r'Process has \d+ hardware threads available, but simulation thread count set to 1024\. This will likely cause significant slowdown\.' ) test.passes()