diff --git a/Changes b/Changes
index bde74e7e6..e321419af 100644
--- a/Changes
+++ b/Changes
@@ -27,6 +27,7 @@ Verilator 5.035 devel
 * Add `--make json` to enable integration with non-make/cmake build systems (#5799). [Andrew Voznytsa]
 * Add empty veriuser.h for legacy compatibility.
 * Add DEPRECATED warning on `--xml-only` and `--xml-output`.
+* Add numactl-like automatic assignment of processor affinity.
 * Remove unused gtkwave/wavealloca.h. [Geza Lore]
 * Optimize automatic splitting of some packed variables (#5843). [Geza Lore]
 * Optimize trigger vector in whole words (#5857). [Geza Lore]
diff --git a/bin/verilator_gantt b/bin/verilator_gantt
index b6a0a4d31..59655d464 100755
--- a/bin/verilator_gantt
+++ b/bin/verilator_gantt
@@ -16,7 +16,14 @@ LongestVcdStrValueLength = 0
 Threads = collections.defaultdict(lambda: [])  # List of records per thread id
 Mtasks = collections.defaultdict(lambda: {'elapsed': 0, 'end': 0})
 Cpus = collections.defaultdict(lambda: {'mtask_time': 0})
-Global = {'args': {}, 'cpuinfo': collections.defaultdict(lambda: {}), 'stats': {}}
+Global = {
+    'args': {},
+    'cpuinfo': collections.defaultdict(lambda: {}),
+    'info': {
+        'numa': 'no data'
+    },
+    'stats': {}
+}
 ElapsedTime = None  # total elapsed time
 ExecGraphTime = 0  # total elapsed time executing an exec graph
 ExecGraphIntervals = []  # list of (start, end) pairs
@@ -33,7 +40,8 @@ def read_data(filename):
         re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
         re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
-        re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
+        re_info = re.compile(r'VLPROF info\s+(\S+)\s+(.*)$')
+        re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+(\S+)')
         re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
         re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
 
         cpu = None
@@ -108,6 +116,9 @@ def read_data(filename):
             elif re_arg2.match(line):
                 match = re_arg2.match(line)
                 Global['args'][match.group(1)] = match.group(2)
+            elif re_info.match(line):
+                match = re_info.match(line)
+                Global['info'][match.group(1)] = match.group(2)
             elif re_stat.match(line):
                 match = re_stat.match(line)
                 Global['stats'][match.group(1)] = match.group(2)
@@ -163,6 +174,7 @@ def report():
     print("  Total mtasks = %d" % len(Mtasks))
     print("  Total yields = %d" % int(Global['stats'].get('yields', 0)))
 
+    report_numa()
     report_mtasks()
     report_cpus()
     report_sections()
@@ -183,6 +195,11 @@ def report():
     print()
 
 
+def report_numa():
+    print("\nNUMA assignment:")
+    print("  NUMA status = %s" % Global['info']['numa'])
+
+
 def report_mtasks():
     if not Mtasks:
         return
diff --git a/docs/guide/simulating.rst b/docs/guide/simulating.rst
index 49ba5c720..29231ef81 100644
--- a/docs/guide/simulating.rst
+++ b/docs/guide/simulating.rst
@@ -83,9 +83,10 @@ option will require a longer time to run Verilator, and may increase the
 risk of reset bugs in trade for performance; see the above documentation
 for these options.
 
-If using Verilated multithreaded, use ``numactl`` to ensure you use
-non-conflicting hardware resources. See :ref:`Multithreading`. Also,
-consider using profile-guided optimization; see :ref:`Thread PGO`.
+If using a Verilated multithreaded model, consider overriding Verilator's
+default thread-to-processor assignment by using ``numactl``; see
+:ref:`Multithreading`. Also, consider using profile-guided optimization;
+see :ref:`Thread PGO`.
 
 Minor Verilog code changes can also give big wins. You should not have any
 :option:`UNOPTFLAT` warnings from Verilator. Fixing these warnings can
diff --git a/docs/guide/verilating.rst b/docs/guide/verilating.rst
index 6e63a93f2..96fce9057 100644
--- a/docs/guide/verilating.rst
+++ b/docs/guide/verilating.rst
@@ -243,11 +243,14 @@ trace.  FST tracing can utilize up to 2 offload threads, so there is no use of
 setting :vlopt:`--trace-threads` higher than 2 at the moment.
 
 When running a multithreaded model, the default Linux task scheduler often
-works against the model by assuming short-lived threads and thus
-it often schedules threads using multiple hyperthreads within the same
-physical core. For best performance, use the :command:`numactl` program to
-(when the threading count fits) select unique physical cores on the same
-socket. The same applies for :vlopt:`--trace-threads` as well.
+works against the model by assuming short-lived threads, and thus it often
+schedules threads using multiple hyperthreads within the same physical
+core. On Linux only, if no affinity is already set, Verilator attempts to
+set thread-to-processor affinity in a reasonable way.
+
+For best performance, use the :command:`numactl` program to (when the
+threading count fits) select unique physical cores on the same socket. The
+same applies for :vlopt:`--trace-threads` as well.
 
 As an example, if a model was Verilated with :vlopt:`--threads 4
 <--threads>`, we consult:
diff --git a/include/verilated_profiler.cpp b/include/verilated_profiler.cpp
index 6a7f7dc36..3d3a6d697 100644
--- a/include/verilated_profiler.cpp
+++ b/include/verilated_profiler.cpp
@@ -34,28 +34,6 @@ thread_local VlExecutionProfiler::ExecutionTrace VlExecutionProfiler::t_trace;
 
 constexpr const char* const VlExecutionRecord::s_ascii[];
 
-//=============================================================================
-// VlPgoProfiler implementation
-
-uint16_t VlExecutionRecord::getcpu() {
-#if defined(__linux)
-    return sched_getcpu();  // TODO: this is a system call. Not exactly cheap.
-#elif defined(__APPLE__) && !defined(__arm64__)
-    uint32_t info[4];
-    __cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
-    // info[1] is EBX, bits 24-31 are APIC ID
-    if ((info[3] & (1 << 9)) == 0) {
-        return -1;  // no APIC on chip
-    } else {
-        return (unsigned)info[1] >> 24;
-    }
-#elif defined(_WIN32)
-    return GetCurrentProcessorNumber();
-#else
-    return 0;
-#endif
-}
-
 //=============================================================================
 // VlExecutionProfiler implementation
 
@@ -161,11 +139,17 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
     // TODO Perhaps merge with verilated_coverage output format, so can
     // have a common merging and reporting tool, etc.
-    fprintf(fp, "VLPROFVERSION 2.1 # Verilator execution profile version 2.1\n");
+    fprintf(fp, "VLPROFVERSION 2.2 # Verilator execution profile version 2.2\n");
     fprintf(fp, "VLPROF arg +verilator+prof+exec+start+%" PRIu64 "\n",
             Verilated::threadContextp()->profExecStart());
     fprintf(fp, "VLPROF arg +verilator+prof+exec+window+%u\n",
             Verilated::threadContextp()->profExecWindow());
+    std::string numa = "no threads";
+    if (VlThreadPool* const threadPoolp
+        = static_cast<VlThreadPool*>(Verilated::threadContextp()->threadPoolp())) {
+        numa = threadPoolp->numaStatus();
+    }
+    fprintf(fp, "VLPROF info numa %s\n", numa.c_str());
 
     // Note that VerilatedContext will by default create as many threads as there are hardware
     // processors, but not all of them might be utilized. Report the actual number that has trace
     // entries to avoid over-counting.
diff --git a/include/verilated_profiler.h b/include/verilated_profiler.h
index db1361e8b..5023dcf07 100644
--- a/include/verilated_profiler.h
+++ b/include/verilated_profiler.h
@@ -105,8 +105,6 @@ class VlExecutionRecord final {
     static_assert(alignof(uint64_t) >= alignof(Payload), "Padding not allowed");
     static_assert(alignof(Payload) >= alignof(Type), "Padding not allowed");
 
-    static uint16_t getcpu();  // Return currently executing CPU id
-
 public:
     // CONSTRUCTOR
     VlExecutionRecord() = default;
@@ -120,7 +118,7 @@ public:
     void mtaskBegin(uint32_t id, uint32_t predictStart) {
         m_payload.mtaskBegin.m_id = id;
         m_payload.mtaskBegin.m_predictStart = predictStart;
-        m_payload.mtaskBegin.m_cpu = getcpu();
+        m_payload.mtaskBegin.m_cpu = VlOs::getcpu();
         m_type = Type::MTASK_BEGIN;
     }
     void mtaskEnd(uint32_t id, uint32_t predictCost) {
diff --git a/include/verilated_threads.cpp b/include/verilated_threads.cpp
index 9f36f85e5..1d1f48796 100644
--- a/include/verilated_threads.cpp
+++ b/include/verilated_threads.cpp
@@ -26,6 +26,8 @@
 #include "verilated_threads.h"
 
 #include
+#include
+#include
 #include
 #include
 
@@ -104,9 +106,149 @@ VlThreadPool::VlThreadPool(VerilatedContext* contextp, unsigned nThreads) {
         m_workers.push_back(new VlWorkerThread{contextp});
         m_unassignedWorkers.push(i);
     }
+    m_numaStatus = numaAssign();
 }
 
 VlThreadPool::~VlThreadPool() {
     // Each ~WorkerThread will wait for its thread to exit.
     for (auto& i : m_workers) delete i;
 }
+
+bool VlThreadPool::isNumactlRunning() {
+    // We assume that if the current thread is CPU-masked, we are under numactl, otherwise not.
+    // This shows that numactl is visible through the affinity mask
+#if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
+    const unsigned num_cpus = std::thread::hardware_concurrency();
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    const int rc = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+    if (rc != 0) return true;  // Error; assuming returning true is the least-damage option
+    for (unsigned c = 0; c < std::min(num_cpus, static_cast<unsigned>(CPU_SETSIZE)); ++c) {
+        if (!CPU_ISSET(c, &cpuset)) return true;
+    }
+#endif
+    return false;
+}
+
+std::string VlThreadPool::numaAssign() {
+#if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
+    // If not under numactl, make a reasonable processor affinity selection
+    if (isNumactlRunning()) return "running under numactl";  // User presumably set affinity
+    const int num_threads = static_cast<int>(m_workers.size());
+    const int num_proc = static_cast<int>(std::thread::hardware_concurrency());
+    if (num_threads < 2) return "too few threads";
+    if (num_threads > num_proc) return "too many threads";
+
+    // Read CPU info.
+    // Uncertain if any modern system has gaps in the processor id (Solaris
+    // did), but just in case use vectors instead of processor number math.
+    //
+    // Currently ignoring socket number "physical id".
+    // If processor numbers are sequential on sockets, algorithm works out ok.
+    // If processor numbers are strided on sockets, algorithm also works out ok.
+    std::ifstream is{"/proc/cpuinfo"};
+    if (VL_UNLIKELY(!is)) return "%Warning: no /proc/cpuinfo";
+
+    std::vector<int> unassigned_processors;  // Processors to assign in sorted order
+    std::map<int, int> processor_core;
+    std::multimap<int, int> core_processors;
+    std::set<int> cores;
+    int processor = -1;
+    int core = -1;
+    while (!is.eof()) {
+        std::string line;
+        std::getline(is, line);
+        const std::string::size_type pos = line.find(":");
+        int number = -1;
+        if (pos != std::string::npos) number = atoi(line.c_str() + pos + 1);
+        if (line.compare(0, std::strlen("processor"), "processor") == 0) {
+            processor = number;
+            core = -1;
+        } else if (line.compare(0, std::strlen("core id"), "core id") == 0) {
+            core = number;
+            // std::cout << "p" << processor << " socket " << socket << " c" << core << std::endl;
+            cores.emplace(core);
+            processor_core[processor] = core;
+            core_processors.emplace(core, processor);
+            unassigned_processors.push_back(processor);
+        }
+    }
+
+    // Start scheduling on the current CPU + 1.
+    // This will help to land on the same socket as current CPU, and also
+    // help make sure that different processes have different masks (when
+    // num_threads is not a common-factor of the processor count).
+    std::sort(unassigned_processors.begin(), unassigned_processors.end());
+    {
+        const int on_cpu = sched_getcpu();  // TODO: this is a system call. Not exactly cheap.
+        bool hit = false;
+        std::vector<int> new_front;
+        std::vector<int> new_back;
+        for (const int processor : unassigned_processors) {
+            if (hit) {
+                new_front.push_back(processor);
+            } else {
+                new_back.push_back(processor);
+            }
+            if (processor == on_cpu) hit = true;
+        }
+        unassigned_processors = new_front;
+        unassigned_processors.insert(unassigned_processors.end(), new_back.begin(),
+                                     new_back.end());
+    }
+
+    // If fewer threads than cores, we can schedule per-core
+    const bool core_per_thread = num_threads <= cores.size();
+
+    // Compute core mapping
+    std::multimap<int, int> thread_processors;
+    {
+        std::set<int> assigned_processors;
+        int thread = 0;
+        for (const int processor : unassigned_processors) {
+            // Find a free processor that the current thread can use
+            if (assigned_processors.find(processor) != assigned_processors.end()) continue;
+            assigned_processors.emplace(processor);
+            thread_processors.emplace(thread, processor);
+            if (core_per_thread) {
+                // Also include all other processors on the same core,
+                // so that another thread doesn't land on a different processor in the same core
+                const int core = processor_core[processor];
+                const auto bounds = core_processors.equal_range(core);
+                for (auto it{bounds.first}; it != bounds.second; ++it) {
+                    if (assigned_processors.find(it->second) != assigned_processors.end())
+                        continue;
+                    if (it->second == processor) continue;
+                    thread_processors.emplace(thread, it->second);
+                    assigned_processors.emplace(it->second);
+                }
+            }
+            // Prepare for next loop
+            thread = (thread + 1) % num_threads;
+        }
+    }
+
+    // Set affinity
+    std::string status = "assigned ";
+    for (int thread = 0; thread < num_threads; ++thread) {
+        cpu_set_t cpuset;
+        CPU_ZERO(&cpuset);
+
+        const auto bounds = thread_processors.equal_range(thread);
+        for (auto it{bounds.first}; it != bounds.second; ++it) {
+            if (it != bounds.first) status += ',';
+            status += std::to_string(it->second);
+            CPU_SET(it->second, &cpuset);
+        }
+        status += ";";
+
+        const int rc = pthread_setaffinity_np(m_workers[thread]->m_cthread.native_handle(),
+                                              sizeof(cpu_set_t), &cpuset);
+        if (rc != 0) return "%Warning: pthread_setaffinity_np failed";
+    }
+    // std::cout << "Status: " << status << std::endl;
+    return status;
+#else
+    return "non-supported host OS";
+#endif
+}
diff --git a/include/verilated_threads.h b/include/verilated_threads.h
index a2722b164..c2f05a480 100644
--- a/include/verilated_threads.h
+++ b/include/verilated_threads.h
@@ -34,15 +34,6 @@
 #include
 #include
 
-// clang-format off
-#if defined(__linux)
-# include <sched.h>  // For sched_getcpu()
-#endif
-#if defined(__APPLE__) && !defined(__arm64__)
-# include <cpuid.h>  // For __cpuid_count()
-#endif
-// clang-format on
-
 class VlExecutionProfiler;
 class VlThreadPool;
 
@@ -156,6 +147,10 @@ private:
 
     VL_UNCOPYABLE(VlWorkerThread);
 
+protected:
+    friend class VlThreadPool;
+    const std::thread& cthread() const { return m_cthread; }
+
 public:
     // CONSTRUCTORS
     explicit VlWorkerThread(VerilatedContext* contextp);
@@ -206,12 +201,12 @@ class VlThreadPool final : public VerilatedVirtualBase {
     // MEMBERS
     std::vector<VlWorkerThread*> m_workers;  // our workers
-    // Guards indexes of unassigned workers
-    mutable VerilatedMutex m_mutex;
+    mutable VerilatedMutex m_mutex;  // Guards indexes of unassigned workers
     // Indexes of unassigned workers
     std::stack m_unassignedWorkers VL_GUARDED_BY(m_mutex);
-    // Used for sequentially generating task IDs to avoid shadowing
+    // For sequentially generating task IDs to avoid shadowing
     std::atomic<unsigned> m_assignedTasks{0};
+    std::string m_numaStatus;  // Status of NUMA assignment
 
 public:
     // CONSTRUCTORS
@@ -236,6 +231,7 @@
     }
     unsigned assignTaskIndex() { return m_assignedTasks++; }
     int numThreads() const { return static_cast<int>(m_workers.size()); }
+    std::string numaStatus() const { return m_numaStatus; }
     VlWorkerThread* workerp(int index) {
         assert(index >= 0);
         assert(index < static_cast<int>(m_workers.size()));
@@ -244,6 +240,9 @@
 
 private:
     VL_UNCOPYABLE(VlThreadPool);
+
+    static bool isNumactlRunning();
+    std::string numaAssign();
 };
 
 #endif
diff --git a/include/verilatedos.h b/include/verilatedos.h
index be0cd59a2..6e47e85db 100644
--- a/include/verilatedos.h
+++ b/include/verilatedos.h
@@ -632,7 +632,12 @@ namespace VlOs {
 /// Get environment variable
 extern std::string getenvStr(const std::string& envvar,
                              const std::string& defaultValue) VL_MT_SAFE;
-extern uint64_t memUsageBytes() VL_MT_SAFE;  ///< Return memory usage in bytes, or 0 if unknown
+
+/// Return currently executing processor number; may make an OS call underneath, so can be slow
+extern uint16_t getcpu() VL_MT_SAFE;
+
+/// Return memory usage in bytes, or 0 if unknown
+extern uint64_t memUsageBytes() VL_MT_SAFE;
 
 // Internal: Record CPU time, starting point on construction, and current delta from that
 class DeltaCpuTime final {
diff --git a/include/verilatedos_c.h b/include/verilatedos_c.h
index be932befb..24edda4c0 100644
--- a/include/verilatedos_c.h
+++ b/include/verilatedos_c.h
@@ -31,6 +31,13 @@
 # include <windows.h>  // GetProcessTimes
 # include <psapi.h>  // GetProcessMemoryInfo
 #endif
+
+#if defined(__linux)
+# include <sched.h>  // For sched_getcpu()
+#endif
+#if defined(__APPLE__) && !defined(__arm64__)
+# include <cpuid.h>  // For __cpuid_count()
+#endif
 // clang-format on
 
 namespace VlOs {
@@ -72,6 +79,28 @@ double DeltaWallTime::gettime() VL_MT_SAFE {
 #endif
 }
 
+//=============================================================================
+// VlOs::getcpu implementation
+
+uint16_t getcpu() VL_MT_SAFE {
+#if defined(__linux)
+    return sched_getcpu();  // TODO: this is a system call. Not exactly cheap.
+#elif defined(__APPLE__) && !defined(__arm64__)
+    uint32_t info[4];
+    __cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
+    // info[1] is EBX, bits 24-31 are APIC ID
+    if ((info[3] & (1 << 9)) == 0) {
+        return 0;  // no APIC on chip
+    } else {
+        return (unsigned)info[1] >> 24;
+    }
+#elif defined(_WIN32)
+    return GetCurrentProcessorNumber();
+#else
+    return 0;
+#endif
+}
+
 //=========================================================================
 // VlOs::memUsageBytes implementation
diff --git a/test_regress/t/t_gantt_io.dat b/test_regress/t/t_gantt_io.dat
index 7baa894b0..c5c97d676 100644
--- a/test_regress/t/t_gantt_io.dat
+++ b/test_regress/t/t_gantt_io.dat
@@ -1,6 +1,7 @@
 VLPROFVERSION 2.0
 VLPROF arg +verilator+prof+exec+start+2
 VLPROF arg +verilator+prof+exec+window+2
+VLPROF info numa 0,1,4,5;2,3,6,7
 VLPROF stat yields 0
 VLPROF stat threads 2
 VLPROFPROC processor : 0
diff --git a/test_regress/t/t_gantt_io.out b/test_regress/t/t_gantt_io.out
index 14e5fcf5c..3a63ceec1 100644
--- a/test_regress/t/t_gantt_io.out
+++ b/test_regress/t/t_gantt_io.out
@@ -12,6 +12,9 @@ Summary:
   Total mtasks = 7
   Total yields = 0
 
+NUMA assignment:
+  NUMA status = 0,1,4,5;2,3,6,7
+
 Parallelized code, measured:
   Thread utilization = 14.22%
   Speedup = 0.284x
diff --git a/test_regress/t/t_gantt_io_arm.dat b/test_regress/t/t_gantt_io_arm.dat
index 04376808a..4fd34cfc9 100644
--- a/test_regress/t/t_gantt_io_arm.dat
+++ b/test_regress/t/t_gantt_io_arm.dat
@@ -1,6 +1,7 @@
 VLPROFVERSION 2.0
 VLPROF arg +verilator+prof+exec+start+1
 VLPROF arg +verilator+prof+exec+window+2
+VLPROF info numa 0,2;1,3
 VLPROF stat threads 2
 VLPROF stat yields 51
 VLPROFPROC processor : 0
diff --git a/test_regress/t/t_gantt_io_arm.out b/test_regress/t/t_gantt_io_arm.out
index 533bcde25..854b5ad77 100644
--- a/test_regress/t/t_gantt_io_arm.out
+++ b/test_regress/t/t_gantt_io_arm.out
@@ -12,6 +12,9 @@ Summary:
   Total mtasks = 5
   Total yields = 51
 
+NUMA assignment:
+  NUMA status = 0,2;1,3
+
 Parallelized code, measured:
   Thread utilization = 42.50%
   Speedup = 0.85x
diff --git a/test_regress/t/t_gantt_io_noproc.out b/test_regress/t/t_gantt_io_noproc.out
index 0f50cfc39..86e4241f6 100644
--- a/test_regress/t/t_gantt_io_noproc.out
+++ b/test_regress/t/t_gantt_io_noproc.out
@@ -12,6 +12,9 @@ Summary:
   Total mtasks = 7
   Total yields = 0
 
+NUMA assignment:
+  NUMA status = no data
+
 Parallelized code, measured:
   Thread utilization = 14.22%
   Speedup = 0.284x
diff --git a/test_regress/t/t_gantt_numa.py b/test_regress/t/t_gantt_numa.py
new file mode 100755
index 000000000..572cef073
--- /dev/null
+++ b/test_regress/t/t_gantt_numa.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2024 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+# Test for bin/verilator_gantt
+
+import vltest_bootstrap
+
+test.scenarios('vltmt')
+test.top_filename = "t/t_gen_alw.v"  # Any, as long as it runs a few cycles
+
+test.compile(
+    v_flags2=["--prof-exec"],
+    # Checks below care about thread count
+    threads=4)
+
+# We need several experiments to make sure that the algorithm is working
+trials = 4
+for trial in range(0, trials):
+    print("--------- Trial %d" % trial)
+
+    test.execute(  # Test fail: run_env='numactl -m 0 -C 0,0,0,0',
+        all_run_flags=[
+            "+verilator+prof+exec+start+2", " +verilator+prof+exec+window+2",
+            " +verilator+prof+exec+file+" + test.obj_dir + "/profile_exec.dat"
+        ])
+
+    gantt_log = test.obj_dir + "/gantt.log"
+
+    test.run(cmd=[
+        os.environ["VERILATOR_ROOT"] + "/bin/verilator_gantt", "--no-vcd", test.obj_dir +
+        "/profile_exec.dat", "| tee " + gantt_log
+    ])
+
+    test.file_grep(gantt_log, r'CPU info:')
+    test.file_grep(gantt_log, r'NUMA status += assigned')
+    test.file_grep_not(gantt_log, r'%Warning:')  # e.g. There were fewer CPUs (1) than threads (3).
+
+test.passes()
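
For reference, below is a minimal standalone sketch of the idea behind VlThreadPool::isNumactlRunning() and numaAssign(): leave affinity alone when the process mask is already restricted (as numactl or taskset would do), otherwise pin each worker thread to a processor. It is illustrative only and not part of the patch; it assumes Linux with pthreads, uses a hypothetical worker count, and does simple round-robin placement rather than the patch's per-core grouping that starts at the current CPU + 1.

// Illustrative sketch only (not part of the patch); assumes Linux + pthreads.
// Compile e.g.: g++ -std=c++11 affinity_sketch.cpp -pthread
#include <pthread.h>
#include <sched.h>

#include <algorithm>
#include <chrono>
#include <cstdio>
#include <thread>
#include <vector>

// True if the current thread's affinity mask already excludes some online CPU,
// which suggests numactl/taskset (or similar) has restricted this process.
static bool affinityAlreadyRestricted() {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0) return true;
    const unsigned numCpus = std::thread::hardware_concurrency();
    for (unsigned c = 0; c < numCpus && c < CPU_SETSIZE; ++c) {
        if (!CPU_ISSET(c, &cpuset)) return true;
    }
    return false;
}

int main() {
    const unsigned numCpus = std::max(1u, std::thread::hardware_concurrency());
    const unsigned numThreads = 4;  // Hypothetical worker count
    std::vector<std::thread> workers;
    for (unsigned i = 0; i < numThreads; ++i) {
        // Stand-in worker body; real workers would execute mtasks
        workers.emplace_back([] { std::this_thread::sleep_for(std::chrono::milliseconds(100)); });
    }
    if (affinityAlreadyRestricted()) {
        std::printf("affinity already restricted (e.g. numactl); leaving it alone\n");
    } else {
        // Simple round-robin: pin worker i to processor i % numCpus.  The patch goes
        // further: it starts at the current CPU + 1 and groups all hyperthreads of a
        // core onto the same worker when there are enough cores.
        for (unsigned i = 0; i < workers.size(); ++i) {
            cpu_set_t cpuset;
            CPU_ZERO(&cpuset);
            CPU_SET(i % numCpus, &cpuset);
            pthread_setaffinity_np(workers[i].native_handle(), sizeof(cpu_set_t), &cpuset);
        }
        std::printf("assigned %u workers round-robin over %u processors\n", numThreads, numCpus);
    }
    for (auto& t : workers) t.join();
    return 0;
}

Running such a sketch under "numactl -C 0-3" should take the "already restricted" branch, which corresponds to the "running under numactl" status the patch reports on the VLPROF info numa line.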