Change default thread pool sizes to respect processor affinity (#6604)

Instead of using the number of processors in the host, use the number of
processors available to the process, respecting cpu affinity
assignments. Without pthreads, fall back and use the number of
processors in the host as before.

This is now applied everywhere so running `numactl -C 0-3 verilator` or
`numactl -C 0-3 Vsim` should behave as if the host has 4 cores (e.g.
like in CI jobs).
This commit is contained in:
Geza Lore 2025-10-28 19:10:40 +01:00 committed by GitHub
parent 5642de432b
commit ffbb3229a8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 72 additions and 45 deletions

View File

@ -169,9 +169,10 @@ Summary:
.. option:: --build-jobs <value> .. option:: --build-jobs <value>
Specify the level of parallelism for :vlopt:`--build`. If zero, uses the Specify the level of parallelism for :vlopt:`--build`. If zero, uses the
number of threads in the current hardware. Otherwise, the <value> must number of threads available to the process, which is the number of threads
be a positive integer specifying the maximum number of parallel build assigned by processor affinity (e.g. using `numactl`), or the number of
jobs. threads in the host hardware if unspecified. Otherwise, the <value> must be
a positive integer specifying the maximum number of parallel build jobs.
If not provided, and :vlopt:`-j` is provided, the :vlopt:`-j` value is If not provided, and :vlopt:`-j` is provided, the :vlopt:`-j` value is
used. used.
@ -881,9 +882,10 @@ Summary:
of Verilator if :vlopt:`--verilate-jobs` isn't provided. Also sets of Verilator if :vlopt:`--verilate-jobs` isn't provided. Also sets
:vlopt:`--output-groups` if isn't provided. :vlopt:`--output-groups` if isn't provided.
If zero, uses the number of threads in the current hardware. Otherwise, If zero, uses the number of threads available to the process, which is the
must be a positive integer specifying the maximum number of parallel number of threads assigned by processor affinity (e.g. using `numactl`), or
build jobs. the number of threads in the host hardware if unspecified. Otherwise, must
be a positive integer specifying the maximum number of parallel build jobs.
.. option:: --no-json-edit-nums .. option:: --no-json-edit-nums
@ -1831,7 +1833,9 @@ Summary:
.. option:: --verilate-jobs <value> .. option:: --verilate-jobs <value>
Specify the level of parallelism for the internal compilation steps of Specify the level of parallelism for the internal compilation steps of
Verilator. If zero, uses the number of threads in the current hardware. Verilator. If zero, uses the number of threads available to the process,
which is the number of threads assigned by processor affinity (e.g. using
`numactl`), or the number of threads in the host hardware if unspecified.
Otherwise, must be a positive integer specifying the maximum number of Otherwise, must be a positive integer specifying the maximum number of
parallel build jobs. parallel build jobs.

View File

@ -2807,11 +2807,11 @@ void VerilatedContext::threads(unsigned n) {
if (m_threads == n) return; // To avoid unnecessary warnings if (m_threads == n) return; // To avoid unnecessary warnings
m_threads = n; m_threads = n;
const unsigned hardwareThreadsAvailable = std::thread::hardware_concurrency(); const unsigned threadsAvailableToProcess = VlOs::getProcessDefaultParallelism();
if (m_threads > hardwareThreadsAvailable) { if (m_threads > threadsAvailableToProcess) {
VL_PRINTF_MT("%%Warning: System has %u hardware threads but simulation thread count set " VL_PRINTF_MT("%%Warning: Process has %u hardware threads available, but simulation thread "
"to %u. This will likely cause significant slowdown.\n", "count set to %u. This will likely cause significant slowdown.\n",
hardwareThreadsAvailable, m_threads); threadsAvailableToProcess, m_threads);
} }
} }

View File

@ -451,7 +451,7 @@ protected:
// Implementation details // Implementation details
const std::unique_ptr<VerilatedContextImpData> m_impdatap; const std::unique_ptr<VerilatedContextImpData> m_impdatap;
// Number of threads to use for simulation (size of m_threadPool + 1 for main thread) // Number of threads to use for simulation (size of m_threadPool + 1 for main thread)
unsigned m_threads = std::thread::hardware_concurrency(); unsigned m_threads = VlOs::getProcessDefaultParallelism();
// Number of threads in added models // Number of threads in added models
unsigned m_threadsInModels = 0; unsigned m_threadsInModels = 0;
// The thread pool shared by all models added to this context // The thread pool shared by all models added to this context

View File

@ -118,30 +118,18 @@ VlThreadPool::~VlThreadPool() {
for (auto& i : m_workers) delete i; for (auto& i : m_workers) delete i;
} }
bool VlThreadPool::isNumactlRunning() {
    // Heuristic: numactl shows up as a restricted affinity mask on the current
    // thread, so a masked-off CPU is taken to mean "running under numactl".
#if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
    cpu_set_t mask;
    CPU_ZERO(&mask);
    if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask) != 0) {
        // Error; assuming returning true is the least-damage option
        return true;
    }
    // Only the CPUs the host reports (capped at what the mask can hold) are inspected
    const unsigned hostCpus = std::thread::hardware_concurrency();
    const unsigned limit = std::min(hostCpus, static_cast<unsigned>(CPU_SETSIZE));
    // Advance past every CPU that is enabled; stopping early means one was masked off
    unsigned cpu = 0;
    while (cpu < limit && CPU_ISSET(cpu, &mask)) ++cpu;
    return cpu != limit;
#else
    return false;
#endif
}
std::string VlThreadPool::numaAssign() { std::string VlThreadPool::numaAssign() {
#if defined(__linux) || defined(CPU_ZERO) || defined(VL_CPPCHECK) // Linux-like pthreads #if defined(__linux) || defined(CPU_ZERO) || defined(VL_CPPCHECK) // Linux-like pthreads
// If not under numactl, make a reasonable processor affinity selection // Get number of processor available to the current process
if (isNumactlRunning()) return "running under numactl"; // User presumably set affinity const unsigned num_proc = VlOs::getProcessAvailableParallelism();
if (!num_proc) return "Can't determine number of available threads";
// If fewer than hardware threads in the host, user presumably set affinity
if (num_proc < std::thread::hardware_concurrency()) return "processor affinity already set";
// Make a reasonable processor affinity selection
const int num_threads = static_cast<int>(m_workers.size()); const int num_threads = static_cast<int>(m_workers.size());
const int num_proc = static_cast<int>(std::thread::hardware_concurrency());
if (num_threads < 2) return "too few threads"; if (num_threads < 2) return "too few threads";
if (num_threads > num_proc) return "too many threads"; if (static_cast<unsigned>(num_threads) > num_proc) return "too many threads";
// Read CPU info. // Read CPU info.
// Uncertain if any modern system has gaps in the processor id (Solaris // Uncertain if any modern system has gaps in the processor id (Solaris

View File

@ -241,8 +241,6 @@ public:
private: private:
VL_UNCOPYABLE(VlThreadPool); VL_UNCOPYABLE(VlThreadPool);
// cppcheck-suppress unusedPrivateFunction
static bool isNumactlRunning();
std::string numaAssign(); std::string numaAssign();
}; };

View File

@ -643,6 +643,16 @@ extern std::string getenvStr(const std::string& envvar,
/// Return currently executing processor number; may do an OS call underneath so slow /// Return currently executing processor number; may do an OS call underneath so slow
extern uint16_t getcpu() VL_MT_SAFE; extern uint16_t getcpu() VL_MT_SAFE;
/// Return number of processors available to the current process. This might be
/// less than the number of logical processors in the machine, if a processor
/// affinity mask was used, e.g. via 'numactl -C 0-3'. Returns 0 if cannot
/// be determined.
extern unsigned getProcessAvailableParallelism() VL_MT_SAFE;
/// Return getProcessAvailableParallelism if non-zero, otherwise the number of
/// hardware threads in the host machine.
extern unsigned getProcessDefaultParallelism() VL_MT_SAFE;
/// Return memory usage in bytes, or 0 if unknown /// Return memory usage in bytes, or 0 if unknown
extern void memUsageBytes(uint64_t& peakr, uint64_t& currentr) VL_MT_SAFE; extern void memUsageBytes(uint64_t& peakr, uint64_t& currentr) VL_MT_SAFE;

View File

@ -104,6 +104,34 @@ uint16_t getcpu() VL_MT_SAFE {
#endif #endif
} }
//=============================================================================
// VlOs::getProcessAvailableParallelism implementation
unsigned getProcessAvailableParallelism() VL_MT_SAFE {
    // Returns the number of processors enabled in the calling thread's
    // affinity mask, or 0 if that cannot be determined.
#if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
    cpu_set_t affinityMask;
    CPU_ZERO(&affinityMask);
    if (pthread_getaffinity_np(pthread_self(), sizeof(affinityMask), &affinityMask) != 0) {
        return 0;  // Cannot determine
    }
    // Count the processors that are set in the mask
    unsigned count = 0;
    for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
        count += CPU_ISSET(cpu, &affinityMask) ? 1U : 0U;
    }
    return count;
#else
    // Cannot determine
    return 0;
#endif
}
//=============================================================================
// VlOs::getProcessDefaultParallelism implementation
unsigned getProcessDefaultParallelism() VL_MT_SAFE {
    // Prefer the number of processors actually available to this process
    const unsigned available = getProcessAvailableParallelism();
    if (available) return available;
    // Otherwise fall back to the host's hardware thread count. The C++ standard
    // allows hardware_concurrency() to return 0 when the value is not computable;
    // the result is used as a thread/job count, so never hand back 0.
    const unsigned hostThreads = std::thread::hardware_concurrency();
    return hostThreads ? hostThreads : 1;
}
//========================================================================= //=========================================================================
// VlOs::memPeakUsageBytes implementation // VlOs::memPeakUsageBytes implementation

View File

@ -3,7 +3,6 @@
###################################################################### ######################################################################
import argparse import argparse
import multiprocessing
import os import os
import shutil import shutil
import subprocess import subprocess
@ -93,7 +92,7 @@ def cleanenv():
def calc_jobs():
    """Return the number of parallel jobs to use: available processors plus one.

    Uses the processors available to this process (respecting CPU affinity)
    where the platform supports it, otherwise the host processor count.
    """
    try:
        avail = len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is only provided on some Unix platforms
        avail = os.cpu_count() or 1
    return avail + 1
def run(command): def run(command):

View File

@ -1267,7 +1267,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
<< "' was passed"); << "' was passed");
val = 1; val = 1;
} else if (val == 0) { } else if (val == 0) {
val = std::thread::hardware_concurrency(); val = VlOs::getProcessDefaultParallelism();
} }
m_buildJobs = val; m_buildJobs = val;
}); });
@ -1781,7 +1781,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
<< valp << "' was passed"); << valp << "' was passed");
val = 1; val = 1;
} else if (val == 0) { } else if (val == 0) {
val = std::thread::hardware_concurrency(); val = VlOs::getProcessDefaultParallelism();
} }
m_verilateJobs = val; m_verilateJobs = val;
}); });
@ -1932,7 +1932,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
int val = 0; int val = 0;
if (i < argc && std::isdigit(argv[i][0])) { if (i < argc && std::isdigit(argv[i][0])) {
val = std::atoi(argv[i]); // Can't be negative due to isdigit above val = std::atoi(argv[i]); // Can't be negative due to isdigit above
if (val == 0) val = std::thread::hardware_concurrency(); if (val == 0) val = VlOs::getProcessDefaultParallelism();
++i; ++i;
} }
if (m_buildJobs == -1) m_buildJobs = val; if (m_buildJobs == -1) m_buildJobs = val;

View File

@ -2763,7 +2763,7 @@ def _calc_hashset() -> list:
@lru_cache(maxsize=1)
def max_procs() -> int:
    """Return the number of processors available for running tests (cached).

    Uses the processors available to this process (respecting CPU affinity)
    where the platform supports it, otherwise the host processor count.
    Prints a notice when fewer than two processors are found.
    """
    try:
        procs = len(os.sched_getaffinity(0))
    except AttributeError:
        # os.sched_getaffinity is only provided on some Unix platforms
        procs = os.cpu_count() or 1
    if procs < 2:
        print("driver.py: Python didn't find at least two CPUs")
    return procs

View File

@ -8,7 +8,7 @@
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 # SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
import vltest_bootstrap import vltest_bootstrap
import multiprocessing import os
# If a test fails, broken .cmake may disturb the next run # If a test fails, broken .cmake may disturb the next run
test.clean_objs() test.clean_objs()
@ -30,7 +30,7 @@ test.run(logfile=test.obj_dir + "/cmake.log",
test.run(logfile=test.obj_dir + "/build.log", test.run(logfile=test.obj_dir + "/build.log",
cmd=[ cmd=[
'cd "' + test.obj_dir + '" && cmake --build', '.', ('-v' if test.verbose else ''), 'cd "' + test.obj_dir + '" && cmake --build', '.', ('-v' if test.verbose else ''),
'-j ' + str(multiprocessing.cpu_count()), '--', "CXX_FLAGS=" + str(threads) '-j ' + str(len(os.sched_getaffinity(0))), '--', "CXX_FLAGS=" + str(threads)
]) ])
test.run(logfile=test.obj_dir + "/run.log", test.run(logfile=test.obj_dir + "/run.log",

View File

@ -19,7 +19,7 @@ test.execute()
if test.vltmt: if test.vltmt:
test.file_grep( test.file_grep(
test.run_log_filename, test.run_log_filename,
r'System has \d+ hardware threads but simulation thread count set to 1024\. This will likely cause significant slowdown\.' r'Process has \d+ hardware threads available, but simulation thread count set to 1024\. This will likely cause significant slowdown\.'
) )
test.passes() test.passes()