From 2ba96536e6030d77ef94b669e1699f879771564f Mon Sep 17 00:00:00 2001
From: Yangyu Chen <cyy@cyyself.name>
Date: Tue, 6 Jan 2026 23:20:57 +0800
Subject: [PATCH] Add VERILATOR_NUMA_STRATEGY environment variable (#6826)
 (#6880)

Signed-off-by: Yangyu Chen <cyy@cyyself.name>
---
 docs/guide/environment.rst     | 16 ++++++++++++++++
 docs/guide/simulating.rst      |  7 +++++--
 docs/guide/verilating.rst      | 16 ++++++++++++++++
 include/verilated_threads.cpp  |  6 ++++++
 test_regress/t/t_gantt_numa.py | 30 ++++++++++++++++++++++++++++++
 5 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/docs/guide/environment.rst b/docs/guide/environment.rst
index 35481e063..99ac7485d 100644
--- a/docs/guide/environment.rst
+++ b/docs/guide/environment.rst
@@ -1,6 +1,8 @@
 .. Copyright 2003-2026 by Wilson Snyder.
 .. SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
 
+.. _Environment:
+
 Environment
 ===========
 
@@ -89,6 +91,20 @@ associated programs.
    If set, the command to run when using the :vlopt:`--gdb` option, such as
    "ddd". If not specified, it will use "gdb".
 
+.. option:: VERILATOR_NUMA_STRATEGY
+
+   If set, controls the NUMA assignment strategy of Verilator's thread pool
+   for Verilated simulations at runtime.
+   Possible values are:
+
+   * Empty (``""``) or ``"default"``: Enables NUMA assignment that prioritizes
+     assigning Verilator threads to physical cores.
+
+   * ``"none"``: Disables NUMA assignment, letting the operating system
+     handle thread scheduling.
+
+   Other values may be supported in future releases.
+
 .. option:: VERILATOR_ROOT
 
    The ``VERILATOR_ROOT`` environment variable is used in several places:
diff --git a/docs/guide/simulating.rst b/docs/guide/simulating.rst
index 5ed589f20..d3da6e77f 100644
--- a/docs/guide/simulating.rst
+++ b/docs/guide/simulating.rst
@@ -85,8 +85,11 @@ above documentation for these options.
 
 If using Verilated multithreaded, consider overriding Verilator's default
 thread-to-processor assignment by using ``numactl``; see
-:ref:`Multithreading`. Also, consider using profile-guided optimization;
-see :ref:`Thread PGO`.
+:ref:`Multithreading`. If your OS can handle thread assignment for your
+design and hardware well, consider disabling Verilator's NUMA assignment by
+setting the :vlopt:`VERILATOR_NUMA_STRATEGY` environment variable to
+``none``; see :ref:`Environment`. Also, consider using profile-guided
+optimization; see :ref:`Thread PGO`.
 
 Minor Verilog code changes can also give big wins. You should not have any
 :option:`UNOPTFLAT` warnings from Verilator. Fixing these warnings can
diff --git a/docs/guide/verilating.rst b/docs/guide/verilating.rst
index 39d2211c8..8ddc40742 100644
--- a/docs/guide/verilating.rst
+++ b/docs/guide/verilating.rst
@@ -285,6 +285,13 @@ schedules threads using multiple hyperthreads within the same physical
 core. If there is no affinity already set, on Linux only, Verilator
 attempts to set thread-to-processor affinity in a reasonable way.
 
+Some newer Linux kernels handle thread assignment well. If running
+Verilator on such a system, automatic thread affinity may not be
+beneficial and may even reduce performance. In this case, the environment
+variable :vlopt:`VERILATOR_NUMA_STRATEGY` may be set to ``none`` to
+disable automatic thread affinity. For more information, refer to
+:ref:`Environment`.
+
 For best performance, use the :command:`numactl` program to (when the
 threading count fits) select unique physical cores on the same socket. The
 same applies for :vlopt:`--trace-threads` as well.
@@ -311,6 +318,15 @@ adjusted if you want another simulator to use, e.g., socket 1, or if you
 Verilated with a different number of threads. To see what CPUs are actually
 used, use :vlopt:`--prof-exec`.
 
+On systems with multiple L3 clusters per socket (e.g., AMD EPYC or Ryzen),
+consider using :command:`lstopo` to determine the L3 cluster topology of
+the current system and :command:`numactl` to bind CPUs within a single L3
+cluster. This can improve performance by minimizing communication latency
+between threads. Sometimes, when a model's thread count exceeds the core
+count per L3 cluster, using SMT (hyperthreads) within a single L3 cluster
+can give better performance than spreading across multiple L3 clusters
+using physical cores only. Experimentation is recommended to find the best
+settings for the underlying hardware and model characteristics.
 
 Multithreaded Verilog and Library Support
 -----------------------------------------
diff --git a/include/verilated_threads.cpp b/include/verilated_threads.cpp
index f3149d50e..b758909be 100644
--- a/include/verilated_threads.cpp
+++ b/include/verilated_threads.cpp
@@ -147,6 +147,12 @@ VlThreadPool::~VlThreadPool() {
 
 std::string VlThreadPool::numaAssign() {
 #if defined(__linux) || defined(CPU_ZERO) || defined(VL_CPPCHECK)  // Linux-like pthreads
+    std::string numa_strategy = VlOs::getenvStr("VERILATOR_NUMA_STRATEGY", "default");
+    if (numa_strategy == "none") {
+        return "no NUMA assignment requested";
+    } else if (numa_strategy != "default" && numa_strategy != "") {
+        return "%Warning: unknown VERILATOR_NUMA_STRATEGY value '" + numa_strategy + "'";
+    }
     // Get number of processor available to the current process
     const unsigned num_proc = VlOs::getProcessAvailableParallelism();
     if (!num_proc) return "Can't determine number of available threads";
diff --git a/test_regress/t/t_gantt_numa.py b/test_regress/t/t_gantt_numa.py
index a58c53bda..9547c8014 100755
--- a/test_regress/t/t_gantt_numa.py
+++ b/test_regress/t/t_gantt_numa.py
@@ -45,4 +45,34 @@ for trial in range(0, trials):
         # False fails occasionally
         # test.file_grep_not(gantt_log, r'%Warning:')  # e.g. There were fewer CPUs (1) than threads (3).
 
+if sys.platform != "darwin":
+    # Test disabling NUMA assignment
+    gantt_log_numa_none = test.obj_dir + "/gantt_numa_none.log"
+    test.execute(run_env='VERILATOR_NUMA_STRATEGY=none',
+                 all_run_flags=[
+                     "+verilator+prof+exec+start+2", " +verilator+prof+exec+window+2",
+                     " +verilator+prof+exec+file+" + test.obj_dir + "/profile_exec.dat"
+                 ])
+    test.run(cmd=[
+        os.environ["VERILATOR_ROOT"] + "/bin/verilator_gantt", "--no-vcd", test.obj_dir +
+        "/profile_exec.dat", "| tee " + gantt_log_numa_none
+    ])
+    test.file_grep(gantt_log_numa_none, r'NUMA status += no NUMA assignment requested')
+
+    # Test invalid NUMA assignment
+    gantt_log_numa_invalid = test.obj_dir + "/gantt_numa_invalid.log"
+    test.execute(run_env='VERILATOR_NUMA_STRATEGY=invalid_value',
+                 all_run_flags=[
+                     "+verilator+prof+exec+start+2", " +verilator+prof+exec+window+2",
+                     " +verilator+prof+exec+file+" + test.obj_dir + "/profile_exec.dat"
+                 ])
+    test.run(cmd=[
+        os.environ["VERILATOR_ROOT"] + "/bin/verilator_gantt", "--no-vcd", test.obj_dir +
+        "/profile_exec.dat", "| tee " + gantt_log_numa_invalid
+    ])
+    # %Warning: unknown VERILATOR_NUMA_STRATEGY value 'invalid_value'
+    test.file_grep(
+        gantt_log_numa_invalid,
+        r"NUMA status += %Warning: unknown VERILATOR_NUMA_STRATEGY value 'invalid_value'")
+
 test.passes()