diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst index 22ee03181..10dfcd611 100644 --- a/docs/guide/exe_verilator.rst +++ b/docs/guide/exe_verilator.rst @@ -168,10 +168,11 @@ Summary: .. option:: --build-jobs - Specify the level of parallelism for :vlopt:`--build`. If zero, uses the - number of threads in the current hardware. Otherwise, the value must - be a positive integer specifying the maximum number of parallel build - jobs. + Specify the level of parallelism for :vlopt:`--build`. If zero, uses the + number of threads available to the process, which is the number of threads + assigned by processor affinity (e.g. using `numactl`), or the number of + threads in the host hardware if unspecified. Otherwise, the value must be + a positive integer specifying the maximum number of parallel build jobs. If not provided, and :vlopt:`-j` is provided, the :vlopt:`-j` value is used. @@ -881,9 +882,10 @@ Summary: of Verilator if :vlopt:`--verilate-jobs` isn't provided. Also sets :vlopt:`--output-groups` if isn't provided. - If zero, uses the number of threads in the current hardware. Otherwise, - must be a positive integer specifying the maximum number of parallel - build jobs. + If zero, uses the number of threads available to the process, which is the + number of threads assigned by processor affinity (e.g. using `numactl`), or + the number of threads in the host hardware if unspecified. Otherwise, must + be a positive integer specifying the maximum number of parallel build jobs. .. option:: --no-json-edit-nums @@ -1831,7 +1833,9 @@ Summary: .. option:: --verilate-jobs Specify the level of parallelism for the internal compilation steps of - Verilator. If zero, uses the number of threads in the current hardware. + Verilator. If zero, uses the number of threads available to the process, + which is the number of threads assigned by processor affinity (e.g. using + `numactl`), or the number of threads in the host hardware if unspecified.
Otherwise, must be a positive integer specifying the maximum number of parallel build jobs. diff --git a/include/verilated.cpp b/include/verilated.cpp index d2bf2d93c..185c01838 100644 --- a/include/verilated.cpp +++ b/include/verilated.cpp @@ -2807,11 +2807,11 @@ void VerilatedContext::threads(unsigned n) { if (m_threads == n) return; // To avoid unnecessary warnings m_threads = n; - const unsigned hardwareThreadsAvailable = std::thread::hardware_concurrency(); - if (m_threads > hardwareThreadsAvailable) { - VL_PRINTF_MT("%%Warning: System has %u hardware threads but simulation thread count set " - "to %u. This will likely cause significant slowdown.\n", - hardwareThreadsAvailable, m_threads); + const unsigned threadsAvailableToProcess = VlOs::getProcessDefaultParallelism(); + if (m_threads > threadsAvailableToProcess) { + VL_PRINTF_MT("%%Warning: Process has %u hardware threads available, but simulation thread " + "count set to %u. This will likely cause significant slowdown.\n", + threadsAvailableToProcess, m_threads); } } diff --git a/include/verilated.h b/include/verilated.h index 294ab94a1..fd784b48f 100644 --- a/include/verilated.h +++ b/include/verilated.h @@ -451,7 +451,7 @@ protected: // Implementation details const std::unique_ptr m_impdatap; // Number of threads to use for simulation (size of m_threadPool + 1 for main thread) - unsigned m_threads = std::thread::hardware_concurrency(); + unsigned m_threads = VlOs::getProcessDefaultParallelism(); // Number of threads in added models unsigned m_threadsInModels = 0; // The thread pool shared by all models added to this context diff --git a/include/verilated_threads.cpp b/include/verilated_threads.cpp index acb8a11eb..e6ffa7e3f 100644 --- a/include/verilated_threads.cpp +++ b/include/verilated_threads.cpp @@ -118,30 +118,18 @@ VlThreadPool::~VlThreadPool() { for (auto& i : m_workers) delete i; } -bool VlThreadPool::isNumactlRunning() { - // We assume if current thread is CPU-masked, then under numactl, 
otherwise not. - // This shows that numactl is visible through the affinity mask -#if defined(__linux) || defined(CPU_ZERO) // Linux-like; assume we have pthreads etc - const unsigned num_cpus = std::thread::hardware_concurrency(); - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - const int rc = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); - if (rc != 0) return true; // Error; assuming returning true is the least-damage option - for (unsigned c = 0; c < std::min(num_cpus, static_cast(CPU_SETSIZE)); ++c) { - if (!CPU_ISSET(c, &cpuset)) return true; - } -#endif - return false; -} - std::string VlThreadPool::numaAssign() { #if defined(__linux) || defined(CPU_ZERO) || defined(VL_CPPCHECK) // Linux-like pthreads - // If not under numactl, make a reasonable processor affinity selection - if (isNumactlRunning()) return "running under numactl"; // User presumably set affinity + // Get number of processors available to the current process + const unsigned num_proc = VlOs::getProcessAvailableParallelism(); + if (!num_proc) return "Can't determine number of available threads"; + // If fewer than hardware threads in the host, user presumably set affinity + if (num_proc < std::thread::hardware_concurrency()) return "processor affinity already set"; + + // Make a reasonable processor affinity selection const int num_threads = static_cast(m_workers.size()); - const int num_proc = static_cast(std::thread::hardware_concurrency()); if (num_threads < 2) return "too few threads"; - if (num_threads > num_proc) return "too many threads"; + if (static_cast(num_threads) > num_proc) return "too many threads"; // Read CPU info.
// Uncertain if any modern system has gaps in the processor id (Solaris diff --git a/include/verilated_threads.h b/include/verilated_threads.h index 643ebcf0b..f39015971 100644 --- a/include/verilated_threads.h +++ b/include/verilated_threads.h @@ -241,8 +241,6 @@ public: private: VL_UNCOPYABLE(VlThreadPool); - // cppcheck-suppress unusedPrivateFunction - static bool isNumactlRunning(); std::string numaAssign(); }; diff --git a/include/verilatedos.h b/include/verilatedos.h index 2049eb0cf..7fd03f1e8 100644 --- a/include/verilatedos.h +++ b/include/verilatedos.h @@ -643,6 +643,16 @@ extern std::string getenvStr(const std::string& envvar, /// Return currently executing processor number; may do an OS call underneath so slow extern uint16_t getcpu() VL_MT_SAFE; +/// Return number of processors available to the current process. This might be +/// less than the number of logical processors in the machine, if a processor +/// affinity mask was used, e.g. via 'numactl -C 0-3'. Returns 0 if cannot +/// be determined.
+extern unsigned getProcessDefaultParallelism() VL_MT_SAFE; + /// Return memory usage in bytes, or 0 if unknown extern void memUsageBytes(uint64_t& peakr, uint64_t& currentr) VL_MT_SAFE; diff --git a/include/verilatedos_c.h b/include/verilatedos_c.h index 849419701..cf1588971 100644 --- a/include/verilatedos_c.h +++ b/include/verilatedos_c.h @@ -104,6 +104,34 @@ uint16_t getcpu() VL_MT_SAFE { #endif } +//============================================================================= +// VlOs::getProcessAvailableParallelism implementation + +unsigned getProcessAvailableParallelism() VL_MT_SAFE { +#if defined(__linux) || defined(CPU_ZERO) // Linux-like; assume we have pthreads etc + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + const int rc = pthread_getaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + if (rc == 0) { + unsigned nCpus = 0; + for (int i = 0; i < CPU_SETSIZE; ++i) { + if (CPU_ISSET(i, &cpuset)) ++nCpus; + } + return nCpus; + } +#endif + // Cannot determine + return 0; +} + +//============================================================================= +// VlOs::getProcessDefaultParallelism implementation + +unsigned getProcessDefaultParallelism() VL_MT_SAFE { + const unsigned n = getProcessAvailableParallelism(); + return n ?
n : std::thread::hardware_concurrency(); +} + //========================================================================= // VlOs::memPeakUsageBytes implementation diff --git a/nodist/install_test b/nodist/install_test index e345f9474..67c0fe68e 100755 --- a/nodist/install_test +++ b/nodist/install_test @@ -3,7 +3,6 @@ ###################################################################### import argparse -import multiprocessing import os import shutil import subprocess @@ -93,7 +92,7 @@ def cleanenv(): def calc_jobs(): - return multiprocessing.cpu_count() + 1 + return len(os.sched_getaffinity(0)) + 1 def run(command): diff --git a/src/V3Options.cpp b/src/V3Options.cpp index 46b1221c5..1b38796d0 100644 --- a/src/V3Options.cpp +++ b/src/V3Options.cpp @@ -1267,7 +1267,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, << "' was passed"); val = 1; } else if (val == 0) { - val = std::thread::hardware_concurrency(); + val = VlOs::getProcessDefaultParallelism(); } m_buildJobs = val; }); @@ -1781,7 +1781,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, << valp << "' was passed"); val = 1; } else if (val == 0) { - val = std::thread::hardware_concurrency(); + val = VlOs::getProcessDefaultParallelism(); } m_verilateJobs = val; }); @@ -1932,7 +1932,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, int val = 0; if (i < argc && std::isdigit(argv[i][0])) { val = std::atoi(argv[i]); // Can't be negative due to isdigit above - if (val == 0) val = std::thread::hardware_concurrency(); + if (val == 0) val = VlOs::getProcessDefaultParallelism(); ++i; } if (m_buildJobs == -1) m_buildJobs = val; diff --git a/test_regress/driver.py b/test_regress/driver.py index 13f247e74..b7baec517 100755 --- a/test_regress/driver.py +++ b/test_regress/driver.py @@ -2763,7 +2763,7 @@ def _calc_hashset() -> list: @lru_cache(maxsize=1) def max_procs() -> int: - procs = multiprocessing.cpu_count() + procs = 
len(os.sched_getaffinity(0)) if procs < 2: print("driver.py: Python didn't find at least two CPUs") return procs diff --git a/test_regress/t/t_a7_hier_block_cmake.py b/test_regress/t/t_a7_hier_block_cmake.py index 508e8b999..fac3ee154 100755 --- a/test_regress/t/t_a7_hier_block_cmake.py +++ b/test_regress/t/t_a7_hier_block_cmake.py @@ -8,7 +8,7 @@ # SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 import vltest_bootstrap -import multiprocessing +import os # If a test fails, broken .cmake may disturb the next run test.clean_objs() @@ -30,7 +30,7 @@ test.run(logfile=test.obj_dir + "/cmake.log", test.run(logfile=test.obj_dir + "/build.log", cmd=[ 'cd "' + test.obj_dir + '" && cmake --build', '.', ('-v' if test.verbose else ''), - '-j ' + str(multiprocessing.cpu_count()), '--', "CXX_FLAGS=" + str(threads) + '-j ' + str(len(os.sched_getaffinity(0))), '--', "CXX_FLAGS=" + str(threads) ]) test.run(logfile=test.obj_dir + "/run.log", diff --git a/test_regress/t/t_threads_crazy_context.py b/test_regress/t/t_threads_crazy_context.py index 6ef9e273d..ae52d0316 100755 --- a/test_regress/t/t_threads_crazy_context.py +++ b/test_regress/t/t_threads_crazy_context.py @@ -19,7 +19,7 @@ test.execute() if test.vltmt: test.file_grep( test.run_log_filename, - r'System has \d+ hardware threads but simulation thread count set to 1024\. This will likely cause significant slowdown\.' + r'Process has \d+ hardware threads available, but simulation thread count set to 1024\. This will likely cause significant slowdown\.' ) test.passes()