Change default thread pool sizes to respect processor affinity (#6604)

Instead of using the number of processors in the host, use the number of
processors available to the process, respecting cpu affinity
assignments. Without pthreads, fall back and use the number of
processors in the host as before.

This is now applied everywhere so running `numactl -C 0-3 verilator` or
`numactl -C 0-3 Vsim` should behave as if the host has 4 cores (e.g.
like in CI jobs).
This commit is contained in:
Geza Lore 2025-10-28 19:10:40 +01:00 committed by GitHub
parent 5642de432b
commit ffbb3229a8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 72 additions and 45 deletions

View File

@ -168,10 +168,11 @@ Summary:
.. option:: --build-jobs <value>
Specify the level of parallelism for :vlopt:`--build`. If zero, uses the
number of threads in the current hardware. Otherwise, the <value> must
be a positive integer specifying the maximum number of parallel build
jobs.
Specify the level of parallelism for :vlopt:`--build`. If zero, uses the
number of threads available to the process, which is the number of threads
assigned by processor affinity (e.g. using `numactl`), or the number of
threads in the host hardware if unspecified. Otherwise, the <value> must be
a positive integer specifying the maximum number of parallel build jobs.
If not provided, and :vlopt:`-j` is provided, the :vlopt:`-j` value is
used.
@ -881,9 +882,10 @@ Summary:
of Verilator if :vlopt:`--verilate-jobs` isn't provided. Also sets
:vlopt:`--output-groups` if it isn't provided.
If zero, uses the number of threads in the current hardware. Otherwise,
must be a positive integer specifying the maximum number of parallel
build jobs.
If zero, uses the number of threads available to the process, which is the
number of threads assigned by processor affinity (e.g. using `numactl`), or
the number of threads in the host hardware if unspecified. Otherwise, must
be a positive integer specifying the maximum number of parallel build jobs.
.. option:: --no-json-edit-nums
@ -1831,7 +1833,9 @@ Summary:
.. option:: --verilate-jobs <value>
Specify the level of parallelism for the internal compilation steps of
Verilator. If zero, uses the number of threads in the current hardware.
Verilator. If zero, uses the number of threads available to the process,
which is the number of threads assigned by processor affinity (e.g. using
`numactl`), or the number of threads in the host hardware if unspecified.
Otherwise, must be a positive integer specifying the maximum number of
parallel build jobs.

View File

@ -2807,11 +2807,11 @@ void VerilatedContext::threads(unsigned n) {
if (m_threads == n) return; // To avoid unnecessary warnings
m_threads = n;
const unsigned hardwareThreadsAvailable = std::thread::hardware_concurrency();
if (m_threads > hardwareThreadsAvailable) {
VL_PRINTF_MT("%%Warning: System has %u hardware threads but simulation thread count set "
"to %u. This will likely cause significant slowdown.\n",
hardwareThreadsAvailable, m_threads);
const unsigned threadsAvailableToProcess = VlOs::getProcessDefaultParallelism();
if (m_threads > threadsAvailableToProcess) {
VL_PRINTF_MT("%%Warning: Process has %u hardware threads available, but simulation thread "
"count set to %u. This will likely cause significant slowdown.\n",
threadsAvailableToProcess, m_threads);
}
}

View File

@ -451,7 +451,7 @@ protected:
// Implementation details
const std::unique_ptr<VerilatedContextImpData> m_impdatap;
// Number of threads to use for simulation (size of m_threadPool + 1 for main thread)
unsigned m_threads = std::thread::hardware_concurrency();
unsigned m_threads = VlOs::getProcessDefaultParallelism();
// Number of threads in added models
unsigned m_threadsInModels = 0;
// The thread pool shared by all models added to this context

View File

@ -118,30 +118,18 @@ VlThreadPool::~VlThreadPool() {
for (auto& i : m_workers) delete i;
}
bool VlThreadPool::isNumactlRunning() {
// We assume if current thread is CPU-masked, then under numactl, otherwise not.
// This shows that numactl is visible through the affinity mask
#if defined(__linux) || defined(CPU_ZERO) // Linux-like; assume we have pthreads etc
const unsigned num_cpus = std::thread::hardware_concurrency();
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
const int rc = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
if (rc != 0) return true; // Error; assuming returning true is the least-damage option
for (unsigned c = 0; c < std::min(num_cpus, static_cast<unsigned>(CPU_SETSIZE)); ++c) {
if (!CPU_ISSET(c, &cpuset)) return true;
}
#endif
return false;
}
std::string VlThreadPool::numaAssign() {
#if defined(__linux) || defined(CPU_ZERO) || defined(VL_CPPCHECK) // Linux-like pthreads
// If not under numactl, make a reasonable processor affinity selection
if (isNumactlRunning()) return "running under numactl"; // User presumably set affinity
// Get number of processor available to the current process
const unsigned num_proc = VlOs::getProcessAvailableParallelism();
if (!num_proc) return "Can't determine number of available threads";
// If fewer than hardware threads in the host, user presumably set affinity
if (num_proc < std::thread::hardware_concurrency()) return "processor affinity already set";
// Make a reasonable processor affinity selection
const int num_threads = static_cast<int>(m_workers.size());
const int num_proc = static_cast<int>(std::thread::hardware_concurrency());
if (num_threads < 2) return "too few threads";
if (num_threads > num_proc) return "too many threads";
if (static_cast<unsigned>(num_threads) > num_proc) return "too many threads";
// Read CPU info.
// Uncertain if any modern system has gaps in the processor id (Solaris

View File

@ -241,8 +241,6 @@ public:
private:
VL_UNCOPYABLE(VlThreadPool);
// cppcheck-suppress unusedPrivateFunction
static bool isNumactlRunning();
std::string numaAssign();
};

View File

@ -643,6 +643,16 @@ extern std::string getenvStr(const std::string& envvar,
/// Return currently executing processor number; may do an OS call underneath so slow
extern uint16_t getcpu() VL_MT_SAFE;
/// Return number of processors available to the current process. This might be
/// less than the number of logical processors in the machine, if a processor
/// affinity mask was used, e.g. via 'numactl -C 0-3'. Returns 0 if cannot
/// be determined.
extern unsigned getProcessAvailableParallelism() VL_MT_SAFE;
/// Return getProcessAvailableParallelism if non-zero, otherwise the number of
/// hardware threads in the host machine.
extern unsigned getProcessDefaultParallelism() VL_MT_SAFE;
/// Return memory usage in bytes, or 0 if unknown
extern void memUsageBytes(uint64_t& peakr, uint64_t& currentr) VL_MT_SAFE;

View File

@ -104,6 +104,34 @@ uint16_t getcpu() VL_MT_SAFE {
#endif
}
//=============================================================================
// VlOs::getProcessAvailableParallelism implementation
unsigned getProcessAvailableParallelism() VL_MT_SAFE {
    // Returns the number of processors set in the calling thread's affinity
    // mask, or 0 when that cannot be determined (query failed, or the
    // platform branch below is not compiled in).
#if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
    cpu_set_t affinityMask;
    CPU_ZERO(&affinityMask);
    if (pthread_getaffinity_np(pthread_self(), sizeof(affinityMask), &affinityMask) == 0) {
        // Tally every processor slot that is present in the mask
        unsigned count = 0;
        int cpu = 0;
        while (cpu < CPU_SETSIZE) {
            if (CPU_ISSET(cpu, &affinityMask)) ++count;
            ++cpu;
        }
        return count;
    }
#endif
    // Cannot determine
    return 0;
}
//=============================================================================
// VlOs::getProcessDefaultParallelism implementation
unsigned getProcessDefaultParallelism() VL_MT_SAFE {
    // Prefer the count of processors available to this process; a zero
    // result means it could not be determined, so fall back to the number
    // of hardware threads reported for the host.
    const unsigned available = getProcessAvailableParallelism();
    if (available != 0) return available;
    return std::thread::hardware_concurrency();
}
//=========================================================================
// VlOs::memPeakUsageBytes implementation

View File

@ -3,7 +3,6 @@
######################################################################
import argparse
import multiprocessing
import os
import shutil
import subprocess
@ -93,7 +92,7 @@ def cleanenv():
def calc_jobs():
return multiprocessing.cpu_count() + 1
return len(os.sched_getaffinity(0)) + 1
def run(command):

View File

@ -1267,7 +1267,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
<< "' was passed");
val = 1;
} else if (val == 0) {
val = std::thread::hardware_concurrency();
val = VlOs::getProcessDefaultParallelism();
}
m_buildJobs = val;
});
@ -1781,7 +1781,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
<< valp << "' was passed");
val = 1;
} else if (val == 0) {
val = std::thread::hardware_concurrency();
val = VlOs::getProcessDefaultParallelism();
}
m_verilateJobs = val;
});
@ -1932,7 +1932,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
int val = 0;
if (i < argc && std::isdigit(argv[i][0])) {
val = std::atoi(argv[i]); // Can't be negative due to isdigit above
if (val == 0) val = std::thread::hardware_concurrency();
if (val == 0) val = VlOs::getProcessDefaultParallelism();
++i;
}
if (m_buildJobs == -1) m_buildJobs = val;

View File

@ -2763,7 +2763,7 @@ def _calc_hashset() -> list:
@lru_cache(maxsize=1)
def max_procs() -> int:
procs = multiprocessing.cpu_count()
procs = len(os.sched_getaffinity(0))
if procs < 2:
print("driver.py: Python didn't find at least two CPUs")
return procs

View File

@ -8,7 +8,7 @@
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
import vltest_bootstrap
import multiprocessing
import os
# If a test fails, broken .cmake may disturb the next run
test.clean_objs()
@ -30,7 +30,7 @@ test.run(logfile=test.obj_dir + "/cmake.log",
test.run(logfile=test.obj_dir + "/build.log",
cmd=[
'cd "' + test.obj_dir + '" && cmake --build', '.', ('-v' if test.verbose else ''),
'-j ' + str(multiprocessing.cpu_count()), '--', "CXX_FLAGS=" + str(threads)
'-j ' + str(len(os.sched_getaffinity(0))), '--', "CXX_FLAGS=" + str(threads)
])
test.run(logfile=test.obj_dir + "/run.log",

View File

@ -19,7 +19,7 @@ test.execute()
if test.vltmt:
test.file_grep(
test.run_log_filename,
r'System has \d+ hardware threads but simulation thread count set to 1024\. This will likely cause significant slowdown\.'
r'Process has \d+ hardware threads available, but simulation thread count set to 1024\. This will likely cause significant slowdown\.'
)
test.passes()