2018-07-23 02:54:28 +02:00
|
|
|
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
|
|
|
|
//=============================================================================
|
|
|
|
|
//
|
2021-03-20 22:46:00 +01:00
|
|
|
// Code available from: https://verilator.org
|
|
|
|
|
//
|
2025-01-01 14:30:25 +01:00
|
|
|
// Copyright 2012-2025 by Wilson Snyder. This program is free software; you can
|
2020-03-21 16:24:24 +01:00
|
|
|
// redistribute it and/or modify it under the terms of either the GNU
|
|
|
|
|
// Lesser General Public License Version 3 or the Perl Artistic License
|
|
|
|
|
// Version 2.0.
|
|
|
|
|
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
|
2018-07-23 02:54:28 +02:00
|
|
|
//
|
|
|
|
|
//=============================================================================
|
|
|
|
|
///
|
|
|
|
|
/// \file
|
2021-03-20 22:46:00 +01:00
|
|
|
/// \brief Verilated thread pool implementation code
|
|
|
|
|
///
|
|
|
|
|
/// This file must be compiled and linked against all Verilated objects
|
|
|
|
|
/// that use --threads.
|
|
|
|
|
///
|
|
|
|
|
/// Use "verilator --threads" to add this to the Makefile for the linker.
|
2018-07-23 02:54:28 +02:00
|
|
|
///
|
|
|
|
|
//=============================================================================
|
|
|
|
|
|
|
|
|
|
#include "verilatedos.h"
|
2022-08-05 11:56:57 +02:00
|
|
|
|
2018-07-23 02:54:28 +02:00
|
|
|
#include "verilated_threads.h"
|
2018-10-14 19:43:24 +02:00
|
|
|
|
2018-07-23 02:54:28 +02:00
|
|
|
#include <cstdio>
|
2025-04-02 14:27:23 +02:00
|
|
|
#include <fstream>
|
|
|
|
|
#include <iostream>
|
2022-01-09 22:49:38 +01:00
|
|
|
#include <memory>
|
|
|
|
|
#include <string>
|
2018-07-23 02:54:28 +02:00
|
|
|
|
2025-05-21 01:46:07 +02:00
|
|
|
#ifdef __FreeBSD__
|
2025-05-21 01:46:57 +02:00
|
|
|
#include <pthread_np.h>
|
2025-05-21 01:46:07 +02:00
|
|
|
#endif
|
|
|
|
|
|
2021-03-05 01:23:40 +01:00
|
|
|
//=============================================================================
|
|
|
|
|
// Globals
|
|
|
|
|
|
|
|
|
|
// Internal note: Globals may multi-construct, see verilated.cpp top.
|
|
|
|
|
|
2022-03-27 21:27:40 +02:00
|
|
|
// Global count of spin-wait yields (statistics; incremented from multiple threads)
std::atomic<uint64_t> VlMTaskVertex::s_yields;
|
2018-07-23 02:54:28 +02:00
|
|
|
|
|
|
|
|
//=============================================================================
|
|
|
|
|
// VlMTaskVertex
|
|
|
|
|
|
2022-03-27 21:27:40 +02:00
|
|
|
// Construct an MTask vertex that becomes runnable after upstreamDepCount
// upstream dependencies have completed; the done-counter starts at zero.
VlMTaskVertex::VlMTaskVertex(uint32_t upstreamDepCount)
    : m_upstreamDepsDone{0}
    , m_upstreamDepCount{upstreamDepCount} {
    // The done-counter is updated concurrently; require a lock-free atomic so
    // incrementing it never needs a mutex (runtime check, as lock-freedom is
    // platform-dependent).
    assert(atomic_is_lock_free(&m_upstreamDepsDone));
}
|
|
|
|
|
|
|
|
|
|
//=============================================================================
|
|
|
|
|
// VlWorkerThread
|
|
|
|
|
|
2022-07-12 12:41:15 +02:00
|
|
|
// Construct a worker and immediately launch its OS thread running
// VlWorkerThread::start(this). contextp is the VerilatedContext the worker
// installs into its thread_locals on startup.
VlWorkerThread::VlWorkerThread(VerilatedContext* contextp)
    : m_ready_size{0}
    , m_contextp{contextp} {
#ifdef VL_USE_PTHREADS
    // Init attributes
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    // Attempt to use the same stack size as the current (main) thread if possible,
    // so user designs with deep call stacks behave the same on worker threads.
    // NOTE(review): pthread_get_stacksize_np is a non-portable (_np) call;
    // presumably this VL_USE_PTHREADS path is only built on platforms providing
    // it (e.g. macOS/BSD) -- TODO confirm against the build configuration.
    const size_t stacksize = pthread_get_stacksize_np(pthread_self());
    if (!stacksize || pthread_attr_setstacksize(&attr, stacksize)) {
        // Fall back on default attributes if failed to get/set stack size
        pthread_attr_destroy(&attr);
        pthread_attr_init(&attr);
    }
    // Create thread; on failure abort, as the pool cannot operate short a worker
    if (pthread_create(&m_pthread, &attr, &VlWorkerThread::start, this)) {
        std::cerr << "pthread_create failed" << std::endl;
        std::abort();
    }
    // Destroy attributes (thread keeps its own copy)
    pthread_attr_destroy(&attr);
#else
    // std::thread path: stack size is whatever the implementation defaults to
    m_cthread = std::thread(start, this);
#endif
}
|
2018-07-23 02:54:28 +02:00
|
|
|
|
|
|
|
|
// Destroy the worker: request shutdown, then block until the thread exits.
VlWorkerThread::~VlWorkerThread() {
    // Enqueue the sentinel shutdown task; the worker loop exits on seeing it
    shutdown();
    // The thread should exit; join it.
    #ifdef VL_USE_PTHREADS
    pthread_join(m_pthread, nullptr);
    #else
    m_cthread.join();
    #endif
}
|
|
|
|
|
|
2022-10-03 16:57:37 +02:00
|
|
|
// Sentinel task used to tell a worker to exit. It is never meant to do work:
// VlWorkerThread::main() compares each dequeued task's function pointer
// against &shutdownTask and leaves its loop on a match.
static void shutdownTask(void*, bool) {  // LCOV_EXCL_LINE
    // Deliberately empty, we use the address of this function as a magic number
}
|
|
|
|
|
|
2022-07-12 12:41:15 +02:00
|
|
|
// Request this worker to exit after finishing all previously queued tasks.
void VlWorkerThread::shutdown() {
    // The sentinel's function-pointer value is the shutdown marker main() checks
    addTask(shutdownTask, nullptr);
}
|
|
|
|
|
|
|
|
|
|
void VlWorkerThread::wait() {
|
|
|
|
|
// Enqueue a task that sets this flag. Execution is in-order so this ensures completion.
|
|
|
|
|
std::atomic<bool> flag{false};
|
|
|
|
|
addTask([](void* flagp, bool) { static_cast<std::atomic<bool>*>(flagp)->store(true); }, &flag);
|
|
|
|
|
// Spin wait
|
|
|
|
|
for (unsigned i = 0; i < VL_LOCK_SPINS; ++i) {
|
|
|
|
|
if (flag.load()) return;
|
|
|
|
|
VL_CPU_RELAX();
|
|
|
|
|
}
|
|
|
|
|
// Yield wait
|
|
|
|
|
while (!flag.load()) std::this_thread::yield();
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-23 02:13:46 +01:00
|
|
|
void VlWorkerThread::main() {
|
|
|
|
|
// Initialize thread_locals
|
|
|
|
|
Verilated::threadContextp(m_contextp);
|
|
|
|
|
// One work item
|
2018-07-23 02:54:28 +02:00
|
|
|
ExecRec work;
|
2022-07-12 12:41:15 +02:00
|
|
|
// Wait for the first task without spinning, in case the thread is never actually used.
|
|
|
|
|
dequeWork</* SpinWait: */ false>(&work);
|
2025-11-23 02:13:46 +01:00
|
|
|
// Loop until shutdown task is received
|
|
|
|
|
while (VL_UNLIKELY(work.m_fnp != shutdownTask)) {
|
2022-06-27 15:16:20 +02:00
|
|
|
work.m_fnp(work.m_selfp, work.m_evenCycle);
|
2022-07-12 12:41:15 +02:00
|
|
|
// Wait for next task with spinning.
|
|
|
|
|
dequeWork</* SpinWait: */ true>(&work);
|
2018-07-23 02:54:28 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-23 02:13:46 +01:00
|
|
|
// Thread entry-point trampoline; argp is the owning VlWorkerThread.
// Signature is pthread_create-compatible; also used for the std::thread path.
void* VlWorkerThread::start(void* argp) {
    // static_cast is the correct named cast for void* -> object pointer
    // (reinterpret_cast was overkill here)
    static_cast<VlWorkerThread*>(argp)->main();
    return nullptr;
}
|
2018-07-23 02:54:28 +02:00
|
|
|
|
|
|
|
|
//=============================================================================
|
|
|
|
|
// VlThreadPool
|
|
|
|
|
|
2022-07-12 12:41:15 +02:00
|
|
|
// Build a pool of nThreads workers (all initially unassigned), then apply
// the NUMA/affinity policy and record its status string for reporting.
VlThreadPool::VlThreadPool(VerilatedContext* contextp, unsigned nThreads) {
    unsigned workerId = 0;
    while (workerId < nThreads) {
        m_workers.push_back(new VlWorkerThread{contextp});
        m_unassignedWorkers.push(workerId);
        ++workerId;
    }
    // Pin workers to processors where sensible; returns a human-readable status
    m_numaStatus = numaAssign();
}
|
|
|
|
|
|
|
|
|
|
// Tear down the pool. Deleting a worker shuts it down and joins its thread,
// so this blocks until all pool threads have exited.
VlThreadPool::~VlThreadPool() {
    // Each ~WorkerThread will wait for its thread to exit.
    for (VlWorkerThread* const workerp : m_workers) delete workerp;
}
|
2025-04-02 14:27:23 +02:00
|
|
|
|
|
|
|
|
// Choose and apply a processor-affinity assignment for the pool's worker
// threads on Linux-like hosts, pairing threads with cores read from
// /proc/cpuinfo. Returns a human-readable status string: either the list of
// assigned processors per thread, or a reason why no assignment was made.
std::string VlThreadPool::numaAssign() {
#if defined(__linux) || defined(CPU_ZERO) || defined(VL_CPPCHECK)  // Linux-like pthreads
    // Get number of processor available to the current process
    const unsigned num_proc = VlOs::getProcessAvailableParallelism();
    if (!num_proc) return "Can't determine number of available threads";
    // If fewer than hardware threads in the host, user presumably set affinity
    if (num_proc < std::thread::hardware_concurrency()) return "processor affinity already set";

    // Make a reasonable processor affinity selection
    const int num_threads = static_cast<int>(m_workers.size());
    if (num_threads < 2) return "too few threads";
    // Leave headroom: skip assignment when workers nearly fill the available
    // processors (the main thread also needs a processor)
    if (static_cast<unsigned>(num_threads) >= num_proc - 1) return "too many threads";

    // Read CPU info.
    // Uncertain if any modern system has gaps in the processor id (Solaris
    // did), but just in case use vectors instead of processor number math.
    //
    // Currently ignoring socket number "physical id".
    // If processor numbers are sequential on sockets, algorithm works out ok.
    // If processor numbers are strided on sockets, algorithm also works out ok.
    std::ifstream is{"/proc/cpuinfo"};
    if (VL_UNLIKELY(!is)) return "%Warning: no /proc/cpuinfo";

    std::vector<int> unassigned_processors;  // Processors to assign in sorted order
    std::map<int, int> processor_core;  // processor id -> core id
    std::multimap<int, int> core_processors;  // core id -> all its processor ids (SMT siblings)
    std::set<int> cores;  // distinct core ids seen
    {
        // Parse "processor : N" / "core id : N" pairs; a "core id" line applies
        // to the most recently seen "processor" line.
        int processor = -1;
        while (!is.eof()) {
            std::string line;
            std::getline(is, line);
            std::string::size_type pos = line.find(":");
            int number = -1;
            if (pos != std::string::npos) number = atoi(line.c_str() + pos + 1);
            if (line.compare(0, std::strlen("processor"), "processor") == 0) {
                processor = number;
            } else if (line.compare(0, std::strlen("core id"), "core id") == 0) {
                const int core = number;
                // std::cout << "p" << processor << " socket " << socket << " c" << core <<
                // std::endl;
                cores.emplace(core);
                processor_core[processor] = core;
                core_processors.emplace(core, processor);
                unassigned_processors.push_back(processor);
            }
        }
    }

    // Start scheduling on the current CPU + 1.
    // This will help to land on the same socket as current CPU, and also
    // help make sure that different processes have different masks (when
    // num_threads is not a common-factor of the processor count).
    std::sort(unassigned_processors.begin(), unassigned_processors.end());
    {
        const int on_cpu = sched_getcpu();  // TODO: this is a system call. Not exactly cheap.
        // Rotate the sorted list so processors after on_cpu come first
        bool hit = false;
        std::vector<int> new_front;
        std::vector<int> new_back;
        for (const int processor : unassigned_processors) {
            if (hit) {
                new_front.push_back(processor);
            } else {
                new_back.push_back(processor);
            }
            if (processor == on_cpu) hit = true;
        }
        unassigned_processors = new_front;
        unassigned_processors.insert(unassigned_processors.end(), new_back.begin(),
                                     new_back.end());
    }

    // If less threads than cores, we can schedule per-core
    const bool core_per_thread = num_threads <= cores.size();

    // Compute core mapping: thread number -> set of processors it may run on
    std::multimap<int, int> thread_processors;
    {
        std::set<int> assigned_processors;
        int thread = 0;
        for (const int processor : unassigned_processors) {
            // Find free processor, the current thread can use that
            if (assigned_processors.find(processor) != assigned_processors.end()) continue;
            assigned_processors.emplace(processor);
            thread_processors.emplace(thread, processor);
            if (core_per_thread) {
                // Also include all other processors same core,
                // so that another thread doesn't land on different processor in same core
                const int core = processor_core[processor];
                const auto bounds = core_processors.equal_range(core);
                for (auto it{bounds.first}; it != bounds.second; ++it) {
                    if (assigned_processors.find(it->second) != assigned_processors.end())
                        continue;
                    if (it->second == processor) continue;
                    thread_processors.emplace(thread, it->second);
                    assigned_processors.emplace(it->second);
                }
            }
            // Prepare for next loop: assign processors round-robin across threads
            thread = (thread + 1) % num_threads;
        }
    }

    // Set affinity on each worker thread, building the status string as we go
    std::string status = "assigned ";
    for (int thread = 0; thread < num_threads; ++thread) {
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);

        const auto bounds = thread_processors.equal_range(thread);
        for (auto it{bounds.first}; it != bounds.second; ++it) {
            if (it != bounds.first) status += ',';
            status += std::to_string(it->second);
            CPU_SET(it->second, &cpuset);
        }
        status += ";";

        // NOTE(review): uses m_cthread.native_handle(); presumably this
        // Linux path is only compiled when std::thread (not VL_USE_PTHREADS)
        // is in use -- TODO confirm the #ifdef interaction.
        const int rc = pthread_setaffinity_np(m_workers[thread]->m_cthread.native_handle(),
                                              sizeof(cpu_set_t), &cpuset);
        if (rc != 0) return "%Warning: pthread_setaffinity_np failed";
    }
    // std::cout << "Status: " << status << std::endl;
    return status;
#else
    return "non-supported host OS";
#endif
}
|