From 2ba96536e6030d77ef94b669e1699f879771564f Mon Sep 17 00:00:00 2001 From: Yangyu Chen Date: Tue, 6 Jan 2026 23:20:57 +0800 Subject: [PATCH] Add VERILATOR_NUMA_STRATEGY environment variable (#6826) (#6880) Signed-off-by: Yangyu Chen --- docs/guide/environment.rst | 16 ++++++++++++++++ docs/guide/simulating.rst | 7 +++++-- docs/guide/verilating.rst | 16 ++++++++++++++++ include/verilated_threads.cpp | 6 ++++++ test_regress/t/t_gantt_numa.py | 30 ++++++++++++++++++++++++++++++ 5 files changed, 73 insertions(+), 2 deletions(-) diff --git a/docs/guide/environment.rst b/docs/guide/environment.rst index 35481e063..99ac7485d 100644 --- a/docs/guide/environment.rst +++ b/docs/guide/environment.rst @@ -1,6 +1,8 @@ .. Copyright 2003-2026 by Wilson Snyder. .. SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 +.. _Environment: + Environment =========== @@ -89,6 +91,20 @@ associated programs. If set, the command to run when using the :vlopt:`--gdb` option, such as "ddd". If not specified, it will use "gdb". +.. option:: VERILATOR_NUMA_STRATEGY + + If set, controls NUMA assignment strategy for Verilator's thread pool + for Verilated simulations at runtime. + Possible values are: + + * Empty(``""``) or ``"default"``: Enables NUMA assignment that prioritizes + assigning Verilator threads to physical cores. + + * ``"none"``: Disables NUMA assignment. Let the operating system handle + thread scheduling. + + Other values may be supported in future releases. + .. option:: VERILATOR_ROOT The ``VERILATOR_ROOT`` environment variable is used in several places: diff --git a/docs/guide/simulating.rst b/docs/guide/simulating.rst index 5ed589f20..d3da6e77f 100644 --- a/docs/guide/simulating.rst +++ b/docs/guide/simulating.rst @@ -85,8 +85,11 @@ above documentation for these options. If using Verilated multithreaded, consider overriding Verilator's default thread-to-processor assignment by using ``numactl``; see -:ref:`Multithreading`. 
Also, consider using profile-guided optimization; -see :ref:`Thread PGO`. +:ref:`Multithreading`. If your OS can handle thread assignment for your +design and hardware well, consider disabling Verilator's NUMA assignment by +setting the :vlopt:`VERILATOR_NUMA_STRATEGY` environment variable to +``none``; see :ref:`Environment`. Also, consider using profile-guided +optimization; see :ref:`Thread PGO`. Minor Verilog code changes can also give big wins. You should not have any :option:`UNOPTFLAT` warnings from Verilator. Fixing these warnings can diff --git a/docs/guide/verilating.rst b/docs/guide/verilating.rst index 39d2211c8..8ddc40742 100644 --- a/docs/guide/verilating.rst +++ b/docs/guide/verilating.rst @@ -285,6 +285,13 @@ schedules threads using multiple hyperthreads within the same physical core. If there is no affinity already set, on Linux only, Verilator attempts to set thread-to-processor affinity in a reasonable way. +Some newer Linux kernels handle thread assignment well. If running +Verilator on such a system, automatic thread affinity may not be +beneficial and may even reduce performance. In this case, the environment +variable :vlopt:`VERILATOR_NUMA_STRATEGY` may be set to ``none`` to +disable automatic thread affinity. For more information, refer to +:ref:`Environment`. + For best performance, use the :command:`numactl` program to (when the threading count fits) select unique physical cores on the same socket. The same applies for :vlopt:`--trace-threads` as well. @@ -311,6 +318,15 @@ adjusted if you want another simulator to use, e.g., socket 1, or if you Verilated with a different number of threads. To see what CPUs are actually used, use :vlopt:`--prof-exec`. +On systems with multiple L3 clusters per socket (e.g., AMD EPYC or Ryzen), +consider using :command:`lstopo` to determine the L3 cluster topology of +the current system and :command:`numactl` to bind CPUs within a single L3 +cluster. 
This can improve performance by minimizing communication latency +between threads. Sometimes, when a model's thread count is more than +the core count per L3 cluster, using SMTs (hyperthreads) within a single L3 +cluster can have better performance than spreading across multiple L3 +clusters using physical cores only. Experimentation is recommended to find +the best settings for the underlying hardware and model characteristics. Multithreaded Verilog and Library Support ----------------------------------------- diff --git a/include/verilated_threads.cpp b/include/verilated_threads.cpp index f3149d50e..b758909be 100644 --- a/include/verilated_threads.cpp +++ b/include/verilated_threads.cpp @@ -147,6 +147,12 @@ VlThreadPool::~VlThreadPool() { std::string VlThreadPool::numaAssign() { #if defined(__linux) || defined(CPU_ZERO) || defined(VL_CPPCHECK) // Linux-like pthreads + std::string numa_strategy = VlOs::getenvStr("VERILATOR_NUMA_STRATEGY", "default"); + if (numa_strategy == "none") { + return "no NUMA assignment requested"; + } else if (numa_strategy != "default" && numa_strategy != "") { + return "%Warning: unknown VERILATOR_NUMA_STRATEGY value '" + numa_strategy + "'"; + } // Get number of processor available to the current process const unsigned num_proc = VlOs::getProcessAvailableParallelism(); if (!num_proc) return "Can't determine number of available threads"; diff --git a/test_regress/t/t_gantt_numa.py b/test_regress/t/t_gantt_numa.py index a58c53bda..9547c8014 100755 --- a/test_regress/t/t_gantt_numa.py +++ b/test_regress/t/t_gantt_numa.py @@ -45,4 +45,34 @@ for trial in range(0, trials): # False fails occasionally # test.file_grep_not(gantt_log, r'%Warning:') # e.g. There were fewer CPUs (1) than threads (3). 
+if sys.platform != "darwin": + # Test disabling NUMA assignment + gantt_log_numa_none = test.obj_dir + "/gantt_numa_none.log" + test.execute(run_env='VERILATOR_NUMA_STRATEGY=none', + all_run_flags=[ + "+verilator+prof+exec+start+2", " +verilator+prof+exec+window+2", + " +verilator+prof+exec+file+" + test.obj_dir + "/profile_exec.dat" + ]) + test.run(cmd=[ + os.environ["VERILATOR_ROOT"] + "/bin/verilator_gantt", "--no-vcd", test.obj_dir + + "/profile_exec.dat", "| tee " + gantt_log_numa_none + ]) + test.file_grep(gantt_log_numa_none, r'NUMA status += no NUMA assignment requested') + + # Test invalid NUMA assignment + gantt_log_numa_invalid = test.obj_dir + "/gantt_numa_invalid.log" + test.execute(run_env='VERILATOR_NUMA_STRATEGY=invalid_value', + all_run_flags=[ + "+verilator+prof+exec+start+2", " +verilator+prof+exec+window+2", + " +verilator+prof+exec+file+" + test.obj_dir + "/profile_exec.dat" + ]) + test.run(cmd=[ + os.environ["VERILATOR_ROOT"] + "/bin/verilator_gantt", "--no-vcd", test.obj_dir + + "/profile_exec.dat", "| tee " + gantt_log_numa_invalid + ]) + # %Warning: unknown VERILATOR_NUMA_STRATEGY value 'invalid_value' + test.file_grep( + gantt_log_numa_invalid, + r"NUMA status += %Warning: unknown VERILATOR_NUMA_STRATEGY value 'invalid_value'") + test.passes()