Add numactl-like automatic assignment of processor affinity (#5911)
This commit is contained in:
parent
cd5997a2e6
commit
6d1e82b908
1
Changes
1
Changes
|
|
@ -27,6 +27,7 @@ Verilator 5.035 devel
|
||||||
* Add `--make json` to enable integration with non-make/cmake build systems (#5799). [Andrew Voznytsa]
|
* Add `--make json` to enable integration with non-make/cmake build systems (#5799). [Andrew Voznytsa]
|
||||||
* Add empty veriuser.h for legacy compatibility.
|
* Add empty veriuser.h for legacy compatibility.
|
||||||
* Add DEPRECATED warning on `--xml-only` and `--xml-output`.
|
* Add DEPRECATED warning on `--xml-only` and `--xml-output`.
|
||||||
|
* Add numactl-like automatic assignment of processor affinity.
|
||||||
* Remove unused gtkwave/wavealloca.h. [Geza Lore]
|
* Remove unused gtkwave/wavealloca.h. [Geza Lore]
|
||||||
* Optimize automatic splitting of some packed variables (#5843). [Geza Lore]
|
* Optimize automatic splitting of some packed variables (#5843). [Geza Lore]
|
||||||
* Optimize trigger vector in whole words (#5857). [Geza Lore]
|
* Optimize trigger vector in whole words (#5857). [Geza Lore]
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,14 @@ LongestVcdStrValueLength = 0
|
||||||
Threads = collections.defaultdict(lambda: []) # List of records per thread id
|
Threads = collections.defaultdict(lambda: []) # List of records per thread id
|
||||||
Mtasks = collections.defaultdict(lambda: {'elapsed': 0, 'end': 0})
|
Mtasks = collections.defaultdict(lambda: {'elapsed': 0, 'end': 0})
|
||||||
Cpus = collections.defaultdict(lambda: {'mtask_time': 0})
|
Cpus = collections.defaultdict(lambda: {'mtask_time': 0})
|
||||||
Global = {'args': {}, 'cpuinfo': collections.defaultdict(lambda: {}), 'stats': {}}
|
Global = {
|
||||||
|
'args': {},
|
||||||
|
'cpuinfo': collections.defaultdict(lambda: {}),
|
||||||
|
'info': {
|
||||||
|
'numa': 'no data'
|
||||||
|
},
|
||||||
|
'stats': {}
|
||||||
|
}
|
||||||
ElapsedTime = None # total elapsed time
|
ElapsedTime = None # total elapsed time
|
||||||
ExecGraphTime = 0 # total elapsed time executing an exec graph
|
ExecGraphTime = 0 # total elapsed time executing an exec graph
|
||||||
ExecGraphIntervals = [] # list of (start, end) pairs
|
ExecGraphIntervals = [] # list of (start, end) pairs
|
||||||
|
|
@ -33,7 +40,8 @@ def read_data(filename):
|
||||||
|
|
||||||
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
|
re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
|
||||||
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
|
re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
|
||||||
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)')
|
re_info = re.compile(r'VLPROF info\s+(\S+)\s+(.*)$')
|
||||||
|
re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+(\S+)')
|
||||||
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
|
re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$')
|
||||||
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
|
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
|
||||||
cpu = None
|
cpu = None
|
||||||
|
|
@ -108,6 +116,9 @@ def read_data(filename):
|
||||||
elif re_arg2.match(line):
|
elif re_arg2.match(line):
|
||||||
match = re_arg2.match(line)
|
match = re_arg2.match(line)
|
||||||
Global['args'][match.group(1)] = match.group(2)
|
Global['args'][match.group(1)] = match.group(2)
|
||||||
|
elif re_info.match(line):
|
||||||
|
match = re_info.match(line)
|
||||||
|
Global['info'][match.group(1)] = match.group(2)
|
||||||
elif re_stat.match(line):
|
elif re_stat.match(line):
|
||||||
match = re_stat.match(line)
|
match = re_stat.match(line)
|
||||||
Global['stats'][match.group(1)] = match.group(2)
|
Global['stats'][match.group(1)] = match.group(2)
|
||||||
|
|
@ -163,6 +174,7 @@ def report():
|
||||||
print(" Total mtasks = %d" % len(Mtasks))
|
print(" Total mtasks = %d" % len(Mtasks))
|
||||||
print(" Total yields = %d" % int(Global['stats'].get('yields', 0)))
|
print(" Total yields = %d" % int(Global['stats'].get('yields', 0)))
|
||||||
|
|
||||||
|
report_numa()
|
||||||
report_mtasks()
|
report_mtasks()
|
||||||
report_cpus()
|
report_cpus()
|
||||||
report_sections()
|
report_sections()
|
||||||
|
|
@ -183,6 +195,11 @@ def report():
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
def report_numa():
    """Print the NUMA assignment section of the report.

    The status text is taken from Global['info']['numa'].
    """
    print("\nNUMA assignment:")
    numa_status = Global['info']['numa']
    print(" NUMA status = %s" % numa_status)
|
||||||
|
|
||||||
|
|
||||||
def report_mtasks():
|
def report_mtasks():
|
||||||
if not Mtasks:
|
if not Mtasks:
|
||||||
return
|
return
|
||||||
|
|
|
||||||
|
|
@ -83,9 +83,10 @@ option will require a longer time to run Verilator, and
|
||||||
may increase the risk of reset bugs in trade for performance; see the above
|
may increase the risk of reset bugs in trade for performance; see the above
|
||||||
documentation for these options.
|
documentation for these options.
|
||||||
|
|
||||||
If using Verilated multithreaded, use ``numactl`` to ensure you use
|
If using Verilated multithreaded, consider overriding Verilator's default
|
||||||
non-conflicting hardware resources. See :ref:`Multithreading`. Also,
|
thread-to-processor assignment by using ``numactl``; see
|
||||||
consider using profile-guided optimization; see :ref:`Thread PGO`.
|
:ref:`Multithreading`. Also, consider using profile-guided optimization;
|
||||||
|
see :ref:`Thread PGO`.
|
||||||
|
|
||||||
Minor Verilog code changes can also give big wins. You should not have any
|
Minor Verilog code changes can also give big wins. You should not have any
|
||||||
:option:`UNOPTFLAT` warnings from Verilator. Fixing these warnings can
|
:option:`UNOPTFLAT` warnings from Verilator. Fixing these warnings can
|
||||||
|
|
|
||||||
|
|
@ -243,11 +243,14 @@ trace. FST tracing can utilize up to 2 offload threads, so there is no use
|
||||||
of setting :vlopt:`--trace-threads` higher than 2 at the moment.
|
of setting :vlopt:`--trace-threads` higher than 2 at the moment.
|
||||||
|
|
||||||
When running a multithreaded model, the default Linux task scheduler often
|
When running a multithreaded model, the default Linux task scheduler often
|
||||||
works against the model by assuming short-lived threads and thus
|
works against the model by assuming short-lived threads and thus it often
|
||||||
it often schedules threads using multiple hyperthreads within the same
|
schedules threads using multiple hyperthreads within the same physical
|
||||||
physical core. For best performance, use the :command:`numactl` program to
|
core. If there is no affinity already set, on Linux only, Verilator
|
||||||
(when the threading count fits) select unique physical cores on the same
|
attempts to set thread-to-processor affinity in a reasonable way.
|
||||||
socket. The same applies for :vlopt:`--trace-threads` as well.
|
|
||||||
|
For best performance, use the :command:`numactl` program to (when the
|
||||||
|
threading count fits) select unique physical cores on the same socket. The
|
||||||
|
same applies for :vlopt:`--trace-threads` as well.
|
||||||
|
|
||||||
As an example, if a model was Verilated with
|
As an example, if a model was Verilated with
|
||||||
:vlopt:`--threads 4 <--threads>`, we consult:
|
:vlopt:`--threads 4 <--threads>`, we consult:
|
||||||
|
|
|
||||||
|
|
@ -34,28 +34,6 @@ thread_local VlExecutionProfiler::ExecutionTrace VlExecutionProfiler::t_trace;
|
||||||
|
|
||||||
constexpr const char* const VlExecutionRecord::s_ascii[];
|
constexpr const char* const VlExecutionRecord::s_ascii[];
|
||||||
|
|
||||||
//=============================================================================
|
|
||||||
// VlPgoProfiler implementation
|
|
||||||
|
|
||||||
uint16_t VlExecutionRecord::getcpu() {
|
|
||||||
#if defined(__linux)
|
|
||||||
return sched_getcpu(); // TODO: this is a system call. Not exactly cheap.
|
|
||||||
#elif defined(__APPLE__) && !defined(__arm64__)
|
|
||||||
uint32_t info[4];
|
|
||||||
__cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
|
|
||||||
// info[1] is EBX, bits 24-31 are APIC ID
|
|
||||||
if ((info[3] & (1 << 9)) == 0) {
|
|
||||||
return -1; // no APIC on chip
|
|
||||||
} else {
|
|
||||||
return (unsigned)info[1] >> 24;
|
|
||||||
}
|
|
||||||
#elif defined(_WIN32)
|
|
||||||
return GetCurrentProcessorNumber();
|
|
||||||
#else
|
|
||||||
return 0;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
//=============================================================================
|
//=============================================================================
|
||||||
// VlExecutionProfiler implementation
|
// VlExecutionProfiler implementation
|
||||||
|
|
||||||
|
|
@ -161,11 +139,17 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
|
||||||
|
|
||||||
// TODO Perhaps merge with verilated_coverage output format, so can
|
// TODO Perhaps merge with verilated_coverage output format, so can
|
||||||
// have a common merging and reporting tool, etc.
|
// have a common merging and reporting tool, etc.
|
||||||
fprintf(fp, "VLPROFVERSION 2.1 # Verilator execution profile version 2.1\n");
|
fprintf(fp, "VLPROFVERSION 2.2 # Verilator execution profile version 2.2\n");
|
||||||
fprintf(fp, "VLPROF arg +verilator+prof+exec+start+%" PRIu64 "\n",
|
fprintf(fp, "VLPROF arg +verilator+prof+exec+start+%" PRIu64 "\n",
|
||||||
Verilated::threadContextp()->profExecStart());
|
Verilated::threadContextp()->profExecStart());
|
||||||
fprintf(fp, "VLPROF arg +verilator+prof+exec+window+%u\n",
|
fprintf(fp, "VLPROF arg +verilator+prof+exec+window+%u\n",
|
||||||
Verilated::threadContextp()->profExecWindow());
|
Verilated::threadContextp()->profExecWindow());
|
||||||
|
std::string numa = "no threads";
|
||||||
|
if (VlThreadPool* const threadPoolp
|
||||||
|
= static_cast<VlThreadPool*>(Verilated::threadContextp()->threadPoolp())) {
|
||||||
|
numa = threadPoolp->numaStatus();
|
||||||
|
}
|
||||||
|
fprintf(fp, "VLPROF info numa %s\n", numa.c_str());
|
||||||
// Note that VerilatedContext will by default create as many threads as there are hardware
|
// Note that VerilatedContext will by default create as many threads as there are hardware
|
||||||
// processors, but not all of them might be utilized. Report the actual number that has trace
|
// processors, but not all of them might be utilized. Report the actual number that has trace
|
||||||
// entries to avoid over-counting.
|
// entries to avoid over-counting.
|
||||||
|
|
|
||||||
|
|
@ -105,8 +105,6 @@ class VlExecutionRecord final {
|
||||||
static_assert(alignof(uint64_t) >= alignof(Payload), "Padding not allowed");
|
static_assert(alignof(uint64_t) >= alignof(Payload), "Padding not allowed");
|
||||||
static_assert(alignof(Payload) >= alignof(Type), "Padding not allowed");
|
static_assert(alignof(Payload) >= alignof(Type), "Padding not allowed");
|
||||||
|
|
||||||
static uint16_t getcpu(); // Return currently executing CPU id
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// CONSTRUCTOR
|
// CONSTRUCTOR
|
||||||
VlExecutionRecord() = default;
|
VlExecutionRecord() = default;
|
||||||
|
|
@ -120,7 +118,7 @@ public:
|
||||||
void mtaskBegin(uint32_t id, uint32_t predictStart) {
|
void mtaskBegin(uint32_t id, uint32_t predictStart) {
|
||||||
m_payload.mtaskBegin.m_id = id;
|
m_payload.mtaskBegin.m_id = id;
|
||||||
m_payload.mtaskBegin.m_predictStart = predictStart;
|
m_payload.mtaskBegin.m_predictStart = predictStart;
|
||||||
m_payload.mtaskBegin.m_cpu = getcpu();
|
m_payload.mtaskBegin.m_cpu = VlOs::getcpu();
|
||||||
m_type = Type::MTASK_BEGIN;
|
m_type = Type::MTASK_BEGIN;
|
||||||
}
|
}
|
||||||
void mtaskEnd(uint32_t id, uint32_t predictCost) {
|
void mtaskEnd(uint32_t id, uint32_t predictCost) {
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,8 @@
|
||||||
#include "verilated_threads.h"
|
#include "verilated_threads.h"
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
|
@ -104,9 +106,149 @@ VlThreadPool::VlThreadPool(VerilatedContext* contextp, unsigned nThreads) {
|
||||||
m_workers.push_back(new VlWorkerThread{contextp});
|
m_workers.push_back(new VlWorkerThread{contextp});
|
||||||
m_unassignedWorkers.push(i);
|
m_unassignedWorkers.push(i);
|
||||||
}
|
}
|
||||||
|
m_numaStatus = numaAssign();
|
||||||
}
|
}
|
||||||
|
|
||||||
VlThreadPool::~VlThreadPool() {
|
VlThreadPool::~VlThreadPool() {
|
||||||
// Each ~WorkerThread will wait for its thread to exit.
|
// Each ~WorkerThread will wait for its thread to exit.
|
||||||
for (auto& i : m_workers) delete i;
|
for (auto& i : m_workers) delete i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool VlThreadPool::isNumactlRunning() {
    // Heuristic: if the calling thread's affinity mask excludes any of the
    // first hardware_concurrency() processors, assume the user restricted us
    // (e.g. via numactl); an unrestricted mask is treated as "not under numactl".
#if defined(__linux) || defined(CPU_ZERO)  // Linux-like; assume we have pthreads etc
    cpu_set_t mask;
    CPU_ZERO(&mask);
    if (pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &mask) != 0) {
        // Could not read the mask; reporting "restricted" is the least-damage option
        return true;
    }
    const unsigned hw_cpus = std::thread::hardware_concurrency();
    // Never index past what a cpu_set_t can represent
    const unsigned limit = std::min(hw_cpus, static_cast<unsigned>(CPU_SETSIZE));
    for (unsigned cpu = 0; cpu < limit; ++cpu) {
        if (!CPU_ISSET(cpu, &mask)) return true;
    }
#endif
    return false;
}
|
||||||
|
|
||||||
|
std::string VlThreadPool::numaAssign() {
|
||||||
|
#if defined(__linux) || defined(CPU_ZERO) // Linux-like; assume we have pthreads etc
|
||||||
|
// If not under numactl, make a reasonable processor affinity selection
|
||||||
|
if (isNumactlRunning()) return "running under numactl"; // User presumably set affinity
|
||||||
|
const int num_threads = static_cast<int>(m_workers.size());
|
||||||
|
const int num_proc = static_cast<int>(std::thread::hardware_concurrency());
|
||||||
|
if (num_threads < 2) return "too few threads";
|
||||||
|
if (num_threads > num_proc) return "too many threads";
|
||||||
|
|
||||||
|
// Read CPU info.
|
||||||
|
// Uncertain if any modern system has gaps in the processor id (Solaris
|
||||||
|
// did), but just in case use vectors instead of processor number math.
|
||||||
|
//
|
||||||
|
// Currently ignoring socket number "physical id".
|
||||||
|
// If processor numbers are sequential on sockets, algorithm works out ok.
|
||||||
|
// If processor numbers are strided on sockets, algorithm also works out ok.
|
||||||
|
std::ifstream is{"/proc/cpuinfo"};
|
||||||
|
if (VL_UNLIKELY(!is)) return "%Warning: no /proc/cpuinfo";
|
||||||
|
|
||||||
|
std::vector<int> unassigned_processors; // Processors to assign in sorted order
|
||||||
|
std::map<int, int> processor_core;
|
||||||
|
std::multimap<int, int> core_processors;
|
||||||
|
std::set<int> cores;
|
||||||
|
int processor = -1;
|
||||||
|
int core = -1;
|
||||||
|
while (!is.eof()) {
|
||||||
|
std::string line;
|
||||||
|
std::getline(is, line);
|
||||||
|
static std::string::size_type pos = line.find(":");
|
||||||
|
int number = -1;
|
||||||
|
if (pos != std::string::npos) number = atoi(line.c_str() + pos + 1);
|
||||||
|
if (line.compare(0, std::strlen("processor"), "processor") == 0) {
|
||||||
|
processor = number;
|
||||||
|
core = -1;
|
||||||
|
} else if (line.compare(0, std::strlen("core id"), "core id") == 0) {
|
||||||
|
core = number;
|
||||||
|
// std::cout << "p" << processor << " socket " << socket << " c" << core << std::endl;
|
||||||
|
cores.emplace(core);
|
||||||
|
processor_core[processor] = core;
|
||||||
|
core_processors.emplace(core, processor);
|
||||||
|
unassigned_processors.push_back(processor);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start scheduling on the current CPU + 1.
|
||||||
|
// This will help to land on the same socket as current CPU, and also
|
||||||
|
// help make sure that different processes have different masks (when
|
||||||
|
// num_threads is not a common-factor of the processor count).
|
||||||
|
std::sort(unassigned_processors.begin(), unassigned_processors.end());
|
||||||
|
{
|
||||||
|
const int on_cpu = sched_getcpu(); // TODO: this is a system call. Not exactly cheap.
|
||||||
|
bool hit = false;
|
||||||
|
std::vector<int> new_front;
|
||||||
|
std::vector<int> new_back;
|
||||||
|
for (const int processor : unassigned_processors) {
|
||||||
|
if (hit) {
|
||||||
|
new_front.push_back(processor);
|
||||||
|
} else {
|
||||||
|
new_back.push_back(processor);
|
||||||
|
}
|
||||||
|
if (processor == on_cpu) hit = true;
|
||||||
|
}
|
||||||
|
unassigned_processors = new_front;
|
||||||
|
unassigned_processors.insert(unassigned_processors.end(), new_back.begin(),
|
||||||
|
new_back.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
// If less threads than cores, we can schedule per-core
|
||||||
|
const bool core_per_thread = num_threads <= cores.size();
|
||||||
|
|
||||||
|
// Compute core mapping
|
||||||
|
std::multimap<int, int> thread_processors;
|
||||||
|
{
|
||||||
|
std::set<int> assigned_processors;
|
||||||
|
int thread = 0;
|
||||||
|
for (const int processor : unassigned_processors) {
|
||||||
|
// Find free processor, the current thread can use that
|
||||||
|
if (assigned_processors.find(processor) != assigned_processors.end()) continue;
|
||||||
|
assigned_processors.emplace(processor);
|
||||||
|
thread_processors.emplace(thread, processor);
|
||||||
|
if (core_per_thread) {
|
||||||
|
// Also include all other processors same core,
|
||||||
|
// so that another thread doesn't land on different processor in same core
|
||||||
|
const int core = processor_core[processor];
|
||||||
|
const auto bounds = core_processors.equal_range(core);
|
||||||
|
for (auto it{bounds.first}; it != bounds.second; ++it) {
|
||||||
|
if (assigned_processors.find(it->second) != assigned_processors.end())
|
||||||
|
continue;
|
||||||
|
if (it->second == processor) continue;
|
||||||
|
thread_processors.emplace(thread, it->second);
|
||||||
|
assigned_processors.emplace(it->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Prepare for next loop
|
||||||
|
thread = (thread + 1) % num_threads;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set affinity
|
||||||
|
std::string status = "assigned ";
|
||||||
|
for (int thread = 0; thread < num_threads; ++thread) {
|
||||||
|
cpu_set_t cpuset;
|
||||||
|
CPU_ZERO(&cpuset);
|
||||||
|
|
||||||
|
const auto bounds = thread_processors.equal_range(thread);
|
||||||
|
for (auto it{bounds.first}; it != bounds.second; ++it) {
|
||||||
|
if (it != bounds.first) status += ',';
|
||||||
|
status += std::to_string(it->second);
|
||||||
|
CPU_SET(it->second, &cpuset);
|
||||||
|
}
|
||||||
|
status += ";";
|
||||||
|
|
||||||
|
const int rc = pthread_setaffinity_np(m_workers[thread]->m_cthread.native_handle(),
|
||||||
|
sizeof(cpu_set_t), &cpuset);
|
||||||
|
if (rc != 0) return "%Warning: pthread_setaffinity_np failed";
|
||||||
|
}
|
||||||
|
// std::cout << "Status: " << status << std::endl;
|
||||||
|
return status;
|
||||||
|
#else
|
||||||
|
return "non-supported host OS";
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -34,15 +34,6 @@
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// clang-format off
|
|
||||||
#if defined(__linux)
|
|
||||||
# include <sched.h> // For sched_getcpu()
|
|
||||||
#endif
|
|
||||||
#if defined(__APPLE__) && !defined(__arm64__)
|
|
||||||
# include <cpuid.h> // For __cpuid_count()
|
|
||||||
#endif
|
|
||||||
// clang-format on
|
|
||||||
|
|
||||||
class VlExecutionProfiler;
|
class VlExecutionProfiler;
|
||||||
class VlThreadPool;
|
class VlThreadPool;
|
||||||
|
|
||||||
|
|
@ -156,6 +147,10 @@ private:
|
||||||
|
|
||||||
VL_UNCOPYABLE(VlWorkerThread);
|
VL_UNCOPYABLE(VlWorkerThread);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
friend class VlThreadPool;
|
||||||
|
const std::thread& cthread() const { return m_cthread; }
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// CONSTRUCTORS
|
// CONSTRUCTORS
|
||||||
explicit VlWorkerThread(VerilatedContext* contextp);
|
explicit VlWorkerThread(VerilatedContext* contextp);
|
||||||
|
|
@ -206,12 +201,12 @@ class VlThreadPool final : public VerilatedVirtualBase {
|
||||||
// MEMBERS
|
// MEMBERS
|
||||||
std::vector<VlWorkerThread*> m_workers; // our workers
|
std::vector<VlWorkerThread*> m_workers; // our workers
|
||||||
|
|
||||||
// Guards indexes of unassigned workers
|
mutable VerilatedMutex m_mutex; // Guards indexes of unassigned workers
|
||||||
mutable VerilatedMutex m_mutex;
|
|
||||||
// Indexes of unassigned workers
|
// Indexes of unassigned workers
|
||||||
std::stack<size_t> m_unassignedWorkers VL_GUARDED_BY(m_mutex);
|
std::stack<size_t> m_unassignedWorkers VL_GUARDED_BY(m_mutex);
|
||||||
// Used for sequentially generating task IDs to avoid shadowing
|
// For sequentially generating task IDs to avoid shadowing
|
||||||
std::atomic<unsigned> m_assignedTasks{0};
|
std::atomic<unsigned> m_assignedTasks{0};
|
||||||
|
std::string m_numaStatus; // Status of NUMA assignment
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// CONSTRUCTORS
|
// CONSTRUCTORS
|
||||||
|
|
@ -236,6 +231,7 @@ public:
|
||||||
}
|
}
|
||||||
unsigned assignTaskIndex() { return m_assignedTasks++; }
|
unsigned assignTaskIndex() { return m_assignedTasks++; }
|
||||||
int numThreads() const { return static_cast<int>(m_workers.size()); }
|
int numThreads() const { return static_cast<int>(m_workers.size()); }
|
||||||
|
std::string numaStatus() const { return m_numaStatus; }
|
||||||
VlWorkerThread* workerp(int index) {
|
VlWorkerThread* workerp(int index) {
|
||||||
assert(index >= 0);
|
assert(index >= 0);
|
||||||
assert(index < static_cast<int>(m_workers.size()));
|
assert(index < static_cast<int>(m_workers.size()));
|
||||||
|
|
@ -244,6 +240,9 @@ public:
|
||||||
|
|
||||||
private:
|
private:
|
||||||
VL_UNCOPYABLE(VlThreadPool);
|
VL_UNCOPYABLE(VlThreadPool);
|
||||||
|
|
||||||
|
static bool isNumactlRunning();
|
||||||
|
std::string numaAssign();
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -632,7 +632,12 @@ namespace VlOs {
|
||||||
/// Get environment variable
|
/// Get environment variable
|
||||||
extern std::string getenvStr(const std::string& envvar,
|
extern std::string getenvStr(const std::string& envvar,
|
||||||
const std::string& defaultValue) VL_MT_SAFE;
|
const std::string& defaultValue) VL_MT_SAFE;
|
||||||
extern uint64_t memUsageBytes() VL_MT_SAFE; ///< Return memory usage in bytes, or 0 if unknown
|
|
||||||
|
/// Return currently executing processor number; may make an OS call underneath, so may be slow
|
||||||
|
extern uint16_t getcpu() VL_MT_SAFE;
|
||||||
|
|
||||||
|
/// Return memory usage in bytes, or 0 if unknown
|
||||||
|
extern uint64_t memUsageBytes() VL_MT_SAFE;
|
||||||
|
|
||||||
// Internal: Record CPU time, starting point on construction, and current delta from that
|
// Internal: Record CPU time, starting point on construction, and current delta from that
|
||||||
class DeltaCpuTime final {
|
class DeltaCpuTime final {
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,13 @@
|
||||||
# include <processthreadsapi.h> // GetProcessTimes
|
# include <processthreadsapi.h> // GetProcessTimes
|
||||||
# include <psapi.h> // GetProcessMemoryInfo
|
# include <psapi.h> // GetProcessMemoryInfo
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__linux)
|
||||||
|
# include <sched.h> // For sched_getcpu()
|
||||||
|
#endif
|
||||||
|
#if defined(__APPLE__) && !defined(__arm64__)
|
||||||
|
# include <cpuid.h> // For __cpuid_count()
|
||||||
|
#endif
|
||||||
// clang-format on
|
// clang-format on
|
||||||
|
|
||||||
namespace VlOs {
|
namespace VlOs {
|
||||||
|
|
@ -72,6 +79,28 @@ double DeltaWallTime::gettime() VL_MT_SAFE {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//=============================================================================
|
||||||
|
// VlOs::getcpu implementation
|
||||||
|
|
||||||
|
// Return the number of the processor the calling thread is currently
// executing on, or 0 when it cannot be determined on this platform.
uint16_t getcpu() VL_MT_SAFE {
#if defined(__linux)
    const int cpu = sched_getcpu();  // TODO: this is a system call. Not exactly cheap.
    // sched_getcpu() returns -1 on error; don't let that wrap to 65535
    return cpu < 0 ? 0 : static_cast<uint16_t>(cpu);
#elif defined(__APPLE__) && !defined(__arm64__)
    uint32_t info[4];
    __cpuid_count(1, 0, info[0], info[1], info[2], info[3]);
    // info[1] is EBX, bits 24-31 are APIC ID
    if ((info[3] & (1 << 9)) == 0) return 0;  // no APIC on chip
    return static_cast<uint16_t>(info[1] >> 24);
#elif defined(_WIN32)
    return static_cast<uint16_t>(GetCurrentProcessorNumber());
#else
    return 0;
#endif
}
|
||||||
|
|
||||||
//=========================================================================
|
//=========================================================================
|
||||||
// VlOs::memUsageBytes implementation
|
// VlOs::memUsageBytes implementation
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
VLPROFVERSION 2.0
|
VLPROFVERSION 2.0
|
||||||
VLPROF arg +verilator+prof+exec+start+2
|
VLPROF arg +verilator+prof+exec+start+2
|
||||||
VLPROF arg +verilator+prof+exec+window+2
|
VLPROF arg +verilator+prof+exec+window+2
|
||||||
|
VLPROF info numa 0,1,4,5;2,3,6,7
|
||||||
VLPROF stat yields 0
|
VLPROF stat yields 0
|
||||||
VLPROF stat threads 2
|
VLPROF stat threads 2
|
||||||
VLPROFPROC processor : 0
|
VLPROFPROC processor : 0
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,9 @@ Summary:
|
||||||
Total mtasks = 7
|
Total mtasks = 7
|
||||||
Total yields = 0
|
Total yields = 0
|
||||||
|
|
||||||
|
NUMA assignment:
|
||||||
|
NUMA status = 0,1,4,5;2,3,6,7
|
||||||
|
|
||||||
Parallelized code, measured:
|
Parallelized code, measured:
|
||||||
Thread utilization = 14.22%
|
Thread utilization = 14.22%
|
||||||
Speedup = 0.284x
|
Speedup = 0.284x
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
VLPROFVERSION 2.0
|
VLPROFVERSION 2.0
|
||||||
VLPROF arg +verilator+prof+exec+start+1
|
VLPROF arg +verilator+prof+exec+start+1
|
||||||
VLPROF arg +verilator+prof+exec+window+2
|
VLPROF arg +verilator+prof+exec+window+2
|
||||||
|
VLPROF info numa 0,2;1,3
|
||||||
VLPROF stat threads 2
|
VLPROF stat threads 2
|
||||||
VLPROF stat yields 51
|
VLPROF stat yields 51
|
||||||
VLPROFPROC processor : 0
|
VLPROFPROC processor : 0
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,9 @@ Summary:
|
||||||
Total mtasks = 5
|
Total mtasks = 5
|
||||||
Total yields = 51
|
Total yields = 51
|
||||||
|
|
||||||
|
NUMA assignment:
|
||||||
|
NUMA status = 0,2;1,3
|
||||||
|
|
||||||
Parallelized code, measured:
|
Parallelized code, measured:
|
||||||
Thread utilization = 42.50%
|
Thread utilization = 42.50%
|
||||||
Speedup = 0.85x
|
Speedup = 0.85x
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,9 @@ Summary:
|
||||||
Total mtasks = 7
|
Total mtasks = 7
|
||||||
Total yields = 0
|
Total yields = 0
|
||||||
|
|
||||||
|
NUMA assignment:
|
||||||
|
NUMA status = no data
|
||||||
|
|
||||||
Parallelized code, measured:
|
Parallelized code, measured:
|
||||||
Thread utilization = 14.22%
|
Thread utilization = 14.22%
|
||||||
Speedup = 0.284x
|
Speedup = 0.284x
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,44 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||||
|
#
|
||||||
|
# Copyright 2024 by Wilson Snyder. This program is free software; you
|
||||||
|
# can redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
|
# Version 2.0.
|
||||||
|
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
|
||||||
|
|
||||||
|
# Test for automatic NUMA assignment, as reported by bin/verilator_gantt
|
||||||
|
|
||||||
|
import vltest_bootstrap
|
||||||
|
|
||||||
|
test.scenarios('vltmt')
test.top_filename = "t/t_gen_alw.v"  # Any, as long as runs a few cycles

# The checks below care about the thread count
test.compile(v_flags2=["--prof-exec"], threads=4)

# Repeat the experiment several times to make sure that the algorithm is working
trials = 4
for trial in range(trials):
    print(f"--------- Trial {trial}")

    profile_file = test.obj_dir + "/profile_exec.dat"
    gantt_log = test.obj_dir + "/gantt.log"

    # Note the test fails with: run_env='numactl -m 0 -C 0,0,0,0'
    run_flags = [
        "+verilator+prof+exec+start+2",
        " +verilator+prof+exec+window+2",
        " +verilator+prof+exec+file+" + profile_file,
    ]
    test.execute(all_run_flags=run_flags)

    # Produce the gantt report from the profile, keeping a copy in gantt_log
    gantt_cmd = [
        os.environ["VERILATOR_ROOT"] + "/bin/verilator_gantt",
        "--no-vcd",
        profile_file,
        "| tee " + gantt_log,
    ]
    test.run(cmd=gantt_cmd)

    test.file_grep(gantt_log, r'CPU info:')
    test.file_grep(gantt_log, r'NUMA status += assigned')
    # e.g. There were fewer CPUs (1) than threads (3).
    test.file_grep_not(gantt_log, r'%Warning:')

test.passes()
|
||||||
Loading…
Reference in New Issue