Add numactl-like automatic assignment of processor affinity (#5911)

2025-04-02 08:27:23 -04:00 · 2025-04-02 08:27:23 -04:00 · 6d1e82b908
parent cd5997a2e6
commit 6d1e82b908
16 changed files with 283 additions and 49 deletions
--- a/1
+++ b/1
@ -27,6 +27,7 @@ Verilator 5.035 devel
 * Add `--make json` to enable integration with non-make/cmake build systems (#5799). [Andrew Voznytsa]
 * Add empty veriuser.h for legacy compatibility.
 * Add DEPRECATED warning on `--xml-only` and `--xml-output`.
 * Add numactl-like automatic assignment of processor affinity.
 * Remove unused gtkwave/wavealloca.h. [Geza Lore]
 * Optimize automatic splitting of some packed variables (#5843). [Geza Lore]
 * Optimize trigger vector in whole words (#5857). [Geza Lore]
--- a/bin/verilator_gantt
+++ b/bin/verilator_gantt
@ -16,7 +16,14 @@ LongestVcdStrValueLength = 0
 Threads = collections.defaultdict(lambda: [])  # List of records per thread id
 Mtasks = collections.defaultdict(lambda: {'elapsed': 0, 'end': 0})
 Cpus = collections.defaultdict(lambda: {'mtask_time': 0})
-Global = {'args': {}, 'cpuinfo': collections.defaultdict(lambda: {}), 'stats': {}}
+Global = {
    'args': {},
    'cpuinfo': collections.defaultdict(lambda: {}),
    'info': {
        'numa': 'no data'
    },
    'stats': {}
 }
 ElapsedTime = None  # total elapsed time
 ExecGraphTime = 0  # total elapsed time executing an exec graph
 ExecGraphIntervals = []  # list of (start, end) pairs
@ -33,7 +40,8 @@ def read_data(filename):
        re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
        re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
-        re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
+        re_info = re.compile(r'VLPROF info\s+(\S+)\s+(.*)$')
        re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+(\S+)')
        re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
        re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
        cpu = None
@ -108,6 +116,9 @@ def read_data(filename):
            elif re_arg2.match(line):
                match = re_arg2.match(line)
                Global['args'][match.group(1)] = match.group(2)
            elif re_info.match(line):
                match = re_info.match(line)
                Global['info'][match.group(1)] = match.group(2)
            elif re_stat.match(line):
                match = re_stat.match(line)
                Global['stats'][match.group(1)] = match.group(2)
@ -163,6 +174,7 @@ def report():
    print("  Total mtasks       = %d" % len(Mtasks))
    print("  Total yields       = %d" % int(Global['stats'].get('yields', 0)))
    report_numa()
    report_mtasks()
    report_cpus()
    report_sections()
@ -183,6 +195,11 @@ def report():
    print()
 def report_numa():
    """Print the "NUMA assignment" section of the report.

    Shows the status string stored under Global['info']['numa'].
    """
    print("\nNUMA assignment:")
    numa_status = Global['info']['numa']
    print("  NUMA status        = %s" % numa_status)
 def report_mtasks():
    if not Mtasks:
        return
--- a/docs/guide/simulating.rst
+++ b/docs/guide/simulating.rst
@ -83,9 +83,10 @@ option will require a longer time to run Verilator, and
 may increase the risk of reset bugs in trade for performance; see the above
 documentation for these options.
-If using Verilated multithreaded, use ``numactl`` to ensure you use
+If using Verilated multithreaded, consider overriding Verilator's default
-non-conflicting hardware resources. See :ref:`Multithreading`. Also,
+thread-to-processor assignment by using ``numactl``; see
-consider using profile-guided optimization; see :ref:`Thread PGO`.
+:ref:`Multithreading`. Also, consider using profile-guided optimization;
 see :ref:`Thread PGO`.
 Minor Verilog code changes can also give big wins.  You should not have any
 :option:`UNOPTFLAT` warnings from Verilator.  Fixing these warnings can
--- a/docs/guide/verilating.rst
+++ b/docs/guide/verilating.rst
@ -243,11 +243,14 @@ trace. FST tracing can utilize up to 2 offload threads, so there is no use
 of setting :vlopt:`--trace-threads` higher than 2 at the moment.
 When running a multithreaded model, the default Linux task scheduler often
-works against the model by assuming short-lived threads and thus
+works against the model by assuming short-lived threads and thus it often
-it often schedules threads using multiple hyperthreads within the same
+schedules threads using multiple hyperthreads within the same physical
-physical core. For best performance, use the :command:`numactl` program to
+core. If there is no affinity already set, on Linux only, Verilator
-(when the threading count fits) select unique physical cores on the same
+attempts to set thread-to-processor affinity in a reasonable way.
-socket. The same applies for :vlopt:`--trace-threads` as well.
+
 For best performance, use the :command:`numactl` program to (when the
 threading count fits) select unique physical cores on the same socket. The
 same applies for :vlopt:`--trace-threads` as well.
 As an example, if a model was Verilated with
 :vlopt:`--threads 4 <--threads>`, we consult:
--- a/include/verilated_profiler.cpp
+++ b/include/verilated_profiler.cpp
@ -34,28 +34,6 @@ thread_local VlExecutionProfiler::ExecutionTrace VlExecutionProfiler::t_trace;
 constexpr const char* const VlExecutionRecord::s_ascii[];
 //=============================================================================
 // VlPgoProfiler implementation
 uint16_t VlExecutionRecord::getcpu() {
    // Return the id of the processor currently executing this thread;
    // how it is obtained depends on the host platform
 #if defined(__linux)
    // TODO: this is a system call. Not exactly cheap.
    const int cpuId = sched_getcpu();
    return cpuId;
 #elif defined(__APPLE__) && !defined(__arm64__)
    uint32_t regs[4];
    __cpuid_count(1, 0, regs[0], regs[1], regs[2], regs[3]);
    const bool apicOnChip = (regs[3] & (1 << 9)) != 0;
    if (!apicOnChip) return -1;  // no APIC on chip (value is converted to uint16_t)
    // regs[1] is EBX, bits 24-31 are APIC ID
    return static_cast<unsigned>(regs[1]) >> 24;
 #elif defined(_WIN32)
    return GetCurrentProcessorNumber();
 #else
    return 0;  // No method known for this platform
 #endif
 }
 //=============================================================================
 // VlExecutionProfiler implementation
@ -161,11 +139,17 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
    // TODO Perhaps merge with verilated_coverage output format, so can
    // have a common merging and reporting tool, etc.
-    fprintf(fp, "VLPROFVERSION 2.1 # Verilator execution profile version 2.1\n");
+    fprintf(fp, "VLPROFVERSION 2.2 # Verilator execution profile version 2.2\n");
    fprintf(fp, "VLPROF arg +verilator+prof+exec+start+%" PRIu64 "\n",
            Verilated::threadContextp()->profExecStart());
    fprintf(fp, "VLPROF arg +verilator+prof+exec+window+%u\n",
            Verilated::threadContextp()->profExecWindow());
    std::string numa = "no threads";
    if (VlThreadPool* const threadPoolp
        = static_cast<VlThreadPool*>(Verilated::threadContextp()->threadPoolp())) {
        numa = threadPoolp->numaStatus();
    }
    fprintf(fp, "VLPROF info numa %s\n", numa.c_str());
    // Note that VerilatedContext will by default create as many threads as there are hardware
    // processors, but not all of them might be utilized. Report the actual number that has trace
    // entries to avoid over-counting.
--- a/include/verilated_profiler.h
+++ b/include/verilated_profiler.h
@ -105,8 +105,6 @@ class VlExecutionRecord final {
    static_assert(alignof(uint64_t) >= alignof(Payload), "Padding not allowed");
    static_assert(alignof(Payload) >= alignof(Type), "Padding not allowed");
    static uint16_t getcpu();  // Return currently executing CPU id
 public:
    // CONSTRUCTOR
    VlExecutionRecord() = default;
@ -120,7 +118,7 @@ public:
    void mtaskBegin(uint32_t id, uint32_t predictStart) {
        m_payload.mtaskBegin.m_id = id;
        m_payload.mtaskBegin.m_predictStart = predictStart;
-        m_payload.mtaskBegin.m_cpu = getcpu();
+        m_payload.mtaskBegin.m_cpu = VlOs::getcpu();
        m_type = Type::MTASK_BEGIN;
    }
    void mtaskEnd(uint32_t id, uint32_t predictCost) {
--- a/include/verilated_threads.cpp
+++ b/include/verilated_threads.cpp
@ -26,6 +26,8 @@
 #include "verilated_threads.h"
 #include <cstdio>
 #include <fstream>
 #include <iostream>
 #include <memory>
 #include <string>
@ -104,9 +106,149 @@ VlThreadPool::VlThreadPool(VerilatedContext* contextp, unsigned nThreads) {
        m_workers.push_back(new VlWorkerThread{contextp});
        m_unassignedWorkers.push(i);
    }
    m_numaStatus = numaAssign();
 }
 VlThreadPool::~VlThreadPool() {
    // Each ~WorkerThread will wait for its thread to exit.
    for (auto& i : m_workers) delete i;
 }
 bool VlThreadPool::isNumactlRunning() {
    // Heuristic: numactl shows up as a restricted affinity mask on the current
    // thread, so if any processor is missing from the mask we presume the user
    // (via numactl or similar) already chose an affinity.
 #if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
    cpu_set_t mask;
    CPU_ZERO(&mask);
    if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask) != 0) {
        // Error; assuming returning true is the least-damage option
        return true;
    }
    // Only examine processors that both exist and fit in the mask
    const unsigned limit
        = std::min(std::thread::hardware_concurrency(), static_cast<unsigned>(CPU_SETSIZE));
    unsigned cpu = 0;
    while (cpu < limit) {
        if (!CPU_ISSET(cpu, &mask)) return true;
        ++cpu;
    }
 #endif
    return false;
 }
 // Give each worker thread an affinity to distinct processor(s), numactl-style.
 //
 // No assignment is made when the current thread already has a restricted
 // affinity (see isNumactlRunning()), when there are fewer than two workers,
 // or when there are more workers than hardware processors.  Otherwise the
 // processor / "core id" pairs are read from /proc/cpuinfo, the processors
 // are ordered starting just after the one currently executing, and dealt out
 // to the workers round-robin.  When there are at least as many cores as
 // workers, all processors of a core go to the same worker.
 //
 // Returns a human-readable status: "assigned " followed by one
 // comma-separated processor list per worker, each terminated by ';', or
 // otherwise a short reason why nothing was assigned.  Reasons starting with
 // "%Warning:" report a failure.
 std::string VlThreadPool::numaAssign() {
 #if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
    // If not under numactl, make a reasonable processor affinity selection
    if (isNumactlRunning()) return "running under numactl";  // User presumably set affinity
    const int num_threads = static_cast<int>(m_workers.size());
    const int num_proc = static_cast<int>(std::thread::hardware_concurrency());
    if (num_threads < 2) return "too few threads";
    if (num_threads > num_proc) return "too many threads";
    // Read CPU info.
    // Uncertain if any modern system has gaps in the processor id (Solaris
    // did), but just in case use vectors instead of processor number math.
    //
    // Currently ignoring socket number "physical id".
    // If processor numbers are sequential on sockets, algorithm works out ok.
    // If processor numbers are strided on sockets, algorithm also works out ok.
    std::ifstream is{"/proc/cpuinfo"};
    if (VL_UNLIKELY(!is)) return "%Warning: no /proc/cpuinfo";
    std::vector<int> unassigned_processors;  // Processors to assign in sorted order
    std::map<int, int> processor_core;  // Processor number -> its core id
    std::multimap<int, int> core_processors;  // Core id -> processor numbers on that core
    std::set<int> cores;  // All core ids seen
    {
        int cur_processor = -1;  // Value of the most recent "processor" line
        std::string line;
        // Loop on getline's result so a failed read ends the loop
        while (std::getline(is, line)) {
            // The colon position differs from line to line, so it must be
            // recomputed for every line (it must not be a static)
            const std::string::size_type pos = line.find(':');
            int number = -1;
            // pos < line.size(), so pos + 1 is at worst the terminating NUL
            if (pos != std::string::npos) number = atoi(line.c_str() + pos + 1);
            if (line.compare(0, std::strlen("processor"), "processor") == 0) {
                cur_processor = number;
            } else if (line.compare(0, std::strlen("core id"), "core id") == 0) {
                const int core = number;
                cores.emplace(core);
                processor_core[cur_processor] = core;
                core_processors.emplace(core, cur_processor);
                unassigned_processors.push_back(cur_processor);
            }
        }
    }
    // Start scheduling on the current CPU + 1.
    // This will help to land on the same socket as current CPU, and also
    // help make sure that different processes have different masks (when
    // num_threads is not a common-factor of the processor count).
    std::sort(unassigned_processors.begin(), unassigned_processors.end());
    {
        const int on_cpu = sched_getcpu();  // TODO: this is a system call. Not exactly cheap.
        const auto hit
            = std::find(unassigned_processors.begin(), unassigned_processors.end(), on_cpu);
        // Rotate so the processor after on_cpu comes first and on_cpu itself
        // comes last; if on_cpu is not in the list the order is left as-is
        if (hit != unassigned_processors.end()) {
            std::rotate(unassigned_processors.begin(), hit + 1, unassigned_processors.end());
        }
    }
    // If no more threads than cores, we can schedule per-core
    const bool core_per_thread = static_cast<size_t>(num_threads) <= cores.size();
    // Compute core mapping
    std::multimap<int, int> thread_processors;  // Worker index -> processors it may run on
    {
        std::set<int> assigned_processors;
        int thread = 0;
        for (const int proc : unassigned_processors) {
            // Find free processor, the current thread can use that
            if (assigned_processors.find(proc) != assigned_processors.end()) continue;
            assigned_processors.emplace(proc);
            thread_processors.emplace(thread, proc);
            if (core_per_thread) {
                // Also include all other processors same core,
                // so that another thread doesn't land on different processor in same core
                const auto bounds = core_processors.equal_range(processor_core[proc]);
                for (auto it{bounds.first}; it != bounds.second; ++it) {
                    if (assigned_processors.find(it->second) != assigned_processors.end())
                        continue;
                    if (it->second == proc) continue;
                    thread_processors.emplace(thread, it->second);
                    assigned_processors.emplace(it->second);
                }
            }
            // Prepare for next loop
            thread = (thread + 1) % num_threads;
        }
    }
    // Set affinity
    std::string status = "assigned ";
    for (int thread = 0; thread < num_threads; ++thread) {
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);
        const auto bounds = thread_processors.equal_range(thread);
        for (auto it{bounds.first}; it != bounds.second; ++it) {
            if (it != bounds.first) status += ',';
            status += std::to_string(it->second);
            CPU_SET(it->second, &cpuset);
        }
        status += ";";
        const int rc = pthread_setaffinity_np(m_workers[thread]->m_cthread.native_handle(),
                                              sizeof(cpu_set_t), &cpuset);
        if (rc != 0) return "%Warning: pthread_setaffinity_np failed";
    }
    // std::cout << "Status: " << status << std::endl;
    return status;
 #else
    return "non-supported host OS";
 #endif
 }
--- a/include/verilated_threads.h
+++ b/include/verilated_threads.h
@ -34,15 +34,6 @@
 #include <thread>
 #include <vector>
 // clang-format off
 #if defined(__linux)
 # include <sched.h>  // For sched_getcpu()
 #endif
 #if defined(__APPLE__) && !defined(__arm64__)
 # include <cpuid.h>  // For __cpuid_count()
 #endif
 // clang-format on
 class VlExecutionProfiler;
 class VlThreadPool;
@ -156,6 +147,10 @@ private:
    VL_UNCOPYABLE(VlWorkerThread);
 protected:
    friend class VlThreadPool;
    // Read-only access to this worker's thread object
    const std::thread& cthread() const {
        return m_cthread;
    }
 public:
    // CONSTRUCTORS
    explicit VlWorkerThread(VerilatedContext* contextp);
@ -206,12 +201,12 @@ class VlThreadPool final : public VerilatedVirtualBase {
    // MEMBERS
    std::vector<VlWorkerThread*> m_workers;  // our workers
-    // Guards indexes of unassigned workers
+    mutable VerilatedMutex m_mutex;  // Guards indexes of unassigned workers
    mutable VerilatedMutex m_mutex;
    // Indexes of unassigned workers
    std::stack<size_t> m_unassignedWorkers VL_GUARDED_BY(m_mutex);
-    // Used for sequentially generating task IDs to avoid shadowing
+    // For sequentially generating task IDs to avoid shadowing
    std::atomic<unsigned> m_assignedTasks{0};
    std::string m_numaStatus;  // Status of NUMA assignment
 public:
    // CONSTRUCTORS
@ -236,6 +231,7 @@ public:
    }
    // Hand out the next task index: returns the counter's value before it is atomically bumped
    unsigned assignTaskIndex() {
        return m_assignedTasks.fetch_add(1);
    }
    // Number of worker threads held by this pool, as a signed int
    int numThreads() const {
        const auto count = m_workers.size();
        return static_cast<int>(count);
    }
    // Copy of the NUMA assignment status string recorded for this pool
    std::string numaStatus() const {
        return m_numaStatus;
    }
    VlWorkerThread* workerp(int index) {
        assert(index >= 0);
        assert(index < static_cast<int>(m_workers.size()));
@ -244,6 +240,9 @@ public:
 private:
    VL_UNCOPYABLE(VlThreadPool);
    static bool isNumactlRunning();
    std::string numaAssign();
 };
 #endif
--- a/include/verilatedos.h
+++ b/include/verilatedos.h
@ -632,7 +632,12 @@ namespace VlOs {
 /// Get environment variable
 extern std::string getenvStr(const std::string& envvar,
                             const std::string& defaultValue) VL_MT_SAFE;
-extern uint64_t memUsageBytes() VL_MT_SAFE;  ///< Return memory usage in bytes, or 0 if unknown
+
 /// Return currently executing processor number; may do an OS call underneath so slow
 extern uint16_t getcpu() VL_MT_SAFE;
 /// Return memory usage in bytes, or 0 if unknown
 extern uint64_t memUsageBytes() VL_MT_SAFE;
 // Internal: Record CPU time, starting point on construction, and current delta from that
 class DeltaCpuTime final {
--- a/include/verilatedos_c.h
+++ b/include/verilatedos_c.h
@ -31,6 +31,13 @@
 # include <processthreadsapi.h>  // GetProcessTimes
 # include <psapi.h>   // GetProcessMemoryInfo
 #endif
 #if defined(__linux)
 # include <sched.h>  // For sched_getcpu()
 #endif
 #if defined(__APPLE__) && !defined(__arm64__)
 # include <cpuid.h>  // For __cpuid_count()
 #endif
 // clang-format on
 namespace VlOs {
@ -72,6 +79,28 @@ double DeltaWallTime::gettime() VL_MT_SAFE {
 #endif
 }
 //=============================================================================
 // VlOs::getcpu implementation
 uint16_t getcpu() VL_MT_SAFE {
    // Return the number of the processor currently executing this thread;
    // how it is obtained depends on the host platform
 #if defined(__linux)
    // TODO: this is a system call. Not exactly cheap.
    const int cpuId = sched_getcpu();
    return cpuId;
 #elif defined(__APPLE__) && !defined(__arm64__)
    uint32_t regs[4];
    __cpuid_count(1, 0, regs[0], regs[1], regs[2], regs[3]);
    const bool apicOnChip = (regs[3] & (1 << 9)) != 0;
    if (!apicOnChip) return 0;  // no APIC on chip
    // regs[1] is EBX, bits 24-31 are APIC ID
    return static_cast<unsigned>(regs[1]) >> 24;
 #elif defined(_WIN32)
    return GetCurrentProcessorNumber();
 #else
    return 0;  // No method known for this platform
 #endif
 }
 //=========================================================================
 // VlOs::memUsageBytes implementation
--- a/test_regress/t/t_gantt_io.dat
+++ b/test_regress/t/t_gantt_io.dat
@ -1,6 +1,7 @@
 VLPROFVERSION 2.0
 VLPROF arg +verilator+prof+exec+start+2
 VLPROF arg +verilator+prof+exec+window+2
 VLPROF info numa 0,1,4,5;2,3,6,7
 VLPROF stat yields 0
 VLPROF stat threads 2
 VLPROFPROC processor    : 0
--- a/test_regress/t/t_gantt_io.out
+++ b/test_regress/t/t_gantt_io.out
@ -12,6 +12,9 @@ Summary:
  Total mtasks       = 7
  Total yields       = 0
 NUMA assignment:
  NUMA status        = 0,1,4,5;2,3,6,7
 Parallelized code, measured:
  Thread utilization =  14.22%
  Speedup            =  0.284x
--- a/test_regress/t/t_gantt_io_arm.dat
+++ b/test_regress/t/t_gantt_io_arm.dat
@ -1,6 +1,7 @@
 VLPROFVERSION 2.0
 VLPROF arg +verilator+prof+exec+start+1
 VLPROF arg +verilator+prof+exec+window+2
 VLPROF info numa 0,2;1,3
 VLPROF stat threads 2
 VLPROF stat yields 51
 VLPROFPROC processor    : 0
--- a/test_regress/t/t_gantt_io_arm.out
+++ b/test_regress/t/t_gantt_io_arm.out
@ -12,6 +12,9 @@ Summary:
  Total mtasks       = 5
  Total yields       = 51
 NUMA assignment:
  NUMA status        = 0,2;1,3
 Parallelized code, measured:
  Thread utilization =  42.50%
  Speedup            =   0.85x
--- a/test_regress/t/t_gantt_io_noproc.out
+++ b/test_regress/t/t_gantt_io_noproc.out
@ -12,6 +12,9 @@ Summary:
  Total mtasks       = 7
  Total yields       = 0
 NUMA assignment:
  NUMA status        = no data
 Parallelized code, measured:
  Thread utilization =  14.22%
  Speedup            =  0.284x
--- a/test_regress/t/t_gantt_numa.py
+++ b/test_regress/t/t_gantt_numa.py
@ -0,0 +1,44 @@
 #!/usr/bin/env python3
 # DESCRIPTION: Verilator: Verilog Test driver/expect definition
 #
 # Copyright 2024 by Wilson Snyder. This program is free software; you
 # can redistribute it and/or modify it under the terms of either the GNU
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
 # SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
 # Test for bin/verilator_gantt, checking the reported NUMA assignment status
 import vltest_bootstrap
 test.scenarios('vltmt')
 test.top_filename = "t/t_gen_alw.v"  # Any, as long as runs a few cycles
 # The checks below care about the thread count, so fix it at compile time
 test.compile(v_flags2=["--prof-exec"], threads=4)
 # A single run is not enough to show the assignment algorithm is working,
 # so repeat the whole execute/report/check sequence several times
 trials = 4
 for trial in range(trials):
    print("--------- Trial %d" % trial)
    profile_dat = test.obj_dir + "/profile_exec.dat"
    # Note: adding run_env='numactl -m 0 -C 0,0,0,0' here makes the test fail
    run_flags = [
        "+verilator+prof+exec+start+2",
        " +verilator+prof+exec+window+2",
        " +verilator+prof+exec+file+" + profile_dat,
    ]
    test.execute(all_run_flags=run_flags)
    gantt_log = test.obj_dir + "/gantt.log"
    gantt_cmd = [
        os.environ["VERILATOR_ROOT"] + "/bin/verilator_gantt",
        "--no-vcd",
        profile_dat,
        "| tee " + gantt_log,
    ]
    test.run(cmd=gantt_cmd)
    # The report must contain these sections/values...
    for wanted in (r'CPU info:', r'NUMA status += assigned'):
        test.file_grep(gantt_log, wanted)
    # ...and no warnings, e.g. There were fewer CPUs (1) than threads (3).
    test.file_grep_not(gantt_log, r'%Warning:')
 test.passes()