2018-07-23 02:54:28 +02:00
|
|
|
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
|
|
|
|
//=============================================================================
|
|
|
|
|
//
|
2021-03-20 22:46:00 +01:00
|
|
|
// Code available from: https://verilator.org
|
|
|
|
|
//
|
2025-01-01 14:30:25 +01:00
|
|
|
// Copyright 2012-2025 by Wilson Snyder. This program is free software; you can
|
2020-03-21 16:24:24 +01:00
|
|
|
// redistribute it and/or modify it under the terms of either the GNU
|
|
|
|
|
// Lesser General Public License Version 3 or the Perl Artistic License
|
|
|
|
|
// Version 2.0.
|
|
|
|
|
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
|
2018-07-23 02:54:28 +02:00
|
|
|
//
|
|
|
|
|
//=============================================================================
|
|
|
|
|
///
|
|
|
|
|
/// \file
|
2021-03-20 22:46:00 +01:00
|
|
|
/// \brief Verilated thread pool implementation code
|
|
|
|
|
///
|
|
|
|
|
/// This file must be compiled and linked against all Verilated objects
|
|
|
|
|
/// that use --threads.
|
|
|
|
|
///
|
|
|
|
|
/// Use "verilator --threads" to add this to the Makefile for the linker.
|
2018-07-23 02:54:28 +02:00
|
|
|
///
|
|
|
|
|
//=============================================================================
|
|
|
|
|
|
|
|
|
|
#include "verilatedos.h"
|
2022-08-05 11:56:57 +02:00
|
|
|
|
2018-07-23 02:54:28 +02:00
|
|
|
#include "verilated_threads.h"
|
2018-10-14 19:43:24 +02:00
|
|
|
|
2018-07-23 02:54:28 +02:00
|
|
|
#include <cstdio>
|
2025-04-02 14:27:23 +02:00
|
|
|
#include <fstream>
|
|
|
|
|
#include <iostream>
|
2022-01-09 22:49:38 +01:00
|
|
|
#include <memory>
|
|
|
|
|
#include <string>
|
2018-07-23 02:54:28 +02:00
|
|
|
|
2025-05-21 01:46:07 +02:00
|
|
|
#ifdef __FreeBSD__
|
2025-05-21 01:46:57 +02:00
|
|
|
#include <pthread_np.h>
|
2025-05-21 01:46:07 +02:00
|
|
|
#endif
|
|
|
|
|
|
2021-03-05 01:23:40 +01:00
|
|
|
//=============================================================================
|
|
|
|
|
// Globals
|
|
|
|
|
|
|
|
|
|
// Internal note: Globals may multi-construct, see verilated.cpp top.
|
|
|
|
|
|
2022-03-27 21:27:40 +02:00
|
|
|
// Global count of spin-wait yields (statistics; incremented from multiple threads)
std::atomic<uint64_t> VlMTaskVertex::s_yields;
|
2018-07-23 02:54:28 +02:00
|
|
|
|
|
|
|
|
//=============================================================================
|
|
|
|
|
// VlMTaskVertex
|
|
|
|
|
|
2022-03-27 21:27:40 +02:00
|
|
|
// Construct an MTask vertex that becomes runnable after upstreamDepCount
// upstream dependencies have completed; the done-counter starts at zero.
VlMTaskVertex::VlMTaskVertex(uint32_t upstreamDepCount)
    : m_upstreamDepsDone{0}
    , m_upstreamDepCount{upstreamDepCount} {
    // The done-counter is updated concurrently; require a lock-free atomic so
    // incrementing it never needs a mutex (runtime check, as lock-freedom is
    // platform-dependent).
    assert(atomic_is_lock_free(&m_upstreamDepsDone));
}
|
|
|
|
|
|
|
|
|
|
//=============================================================================
|
|
|
|
|
// VlWorkerThread
|
|
|
|
|
|
2022-07-12 12:41:15 +02:00
|
|
|
// Construct a worker and immediately launch its OS thread running
// VlWorkerThread::start(this). contextp is the VerilatedContext the worker
// installs into its thread_locals on startup.
VlWorkerThread::VlWorkerThread(VerilatedContext* contextp)
    : m_ready_size{0}
    , m_contextp{contextp} {
#ifdef VL_USE_PTHREADS
    // Init attributes
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    // Attempt to use the same stack size as the current (main) thread if possible,
    // so user designs with deep call stacks behave the same on worker threads.
    // NOTE(review): pthread_get_stacksize_np is a non-portable (_np) call;
    // presumably this VL_USE_PTHREADS path is only built on platforms providing
    // it (e.g. macOS/BSD) -- TODO confirm against the build configuration.
    const size_t stacksize = pthread_get_stacksize_np(pthread_self());
    if (!stacksize || pthread_attr_setstacksize(&attr, stacksize)) {
        // Fall back on default attributes if failed to get/set stack size
        pthread_attr_destroy(&attr);
        pthread_attr_init(&attr);
    }
    // Create thread; on failure abort, as the pool cannot operate short a worker
    if (pthread_create(&m_pthread, &attr, &VlWorkerThread::start, this)) {
        std::cerr << "pthread_create failed" << std::endl;
        std::abort();
    }
    // Destroy attributes (thread keeps its own copy)
    pthread_attr_destroy(&attr);
#else
    // std::thread path: stack size is whatever the implementation defaults to
    m_cthread = std::thread(start, this);
#endif
}
|
2018-07-23 02:54:28 +02:00
|
|
|
|
|
|
|
|
// Destroy the worker: request shutdown, then block until the thread exits.
VlWorkerThread::~VlWorkerThread() {
    // Enqueue the sentinel shutdown task; the worker loop exits on seeing it
    shutdown();
    // The thread should exit; join it.
    #ifdef VL_USE_PTHREADS
    pthread_join(m_pthread, nullptr);
    #else
    m_cthread.join();
    #endif
}
|
|
|
|
|
|
2022-10-03 16:57:37 +02:00
|
|
|
// Sentinel task used to tell a worker to exit. It is never meant to do work:
// VlWorkerThread::main() compares each dequeued task's function pointer
// against &shutdownTask and leaves its loop on a match.
static void shutdownTask(void*, bool) {  // LCOV_EXCL_LINE
    // Deliberately empty, we use the address of this function as a magic number
}
|
|
|
|
|
|
2022-07-12 12:41:15 +02:00
|
|
|
// Request this worker to exit after finishing all previously queued tasks.
void VlWorkerThread::shutdown() {
    // The sentinel's function-pointer value is the shutdown marker main() checks
    addTask(shutdownTask, nullptr);
}
|
|
|
|
|
|
|
|
|
|
void VlWorkerThread::wait() {
|
|
|
|
|
// Enqueue a task that sets this flag. Execution is in-order so this ensures completion.
|
|
|
|
|
std::atomic<bool> flag{false};
|
|
|
|
|
addTask([](void* flagp, bool) { static_cast<std::atomic<bool>*>(flagp)->store(true); }, &flag);
|
|
|
|
|
// Spin wait
|
|
|
|
|
for (unsigned i = 0; i < VL_LOCK_SPINS; ++i) {
|
|
|
|
|
if (flag.load()) return;
|
|
|
|
|
VL_CPU_RELAX();
|
|
|
|
|
}
|
|
|
|
|
// Yield wait
|
|
|
|
|
while (!flag.load()) std::this_thread::yield();
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-23 02:13:46 +01:00
|
|
|
void VlWorkerThread::main() {
|
|
|
|
|
// Initialize thread_locals
|
|
|
|
|
Verilated::threadContextp(m_contextp);
|
|
|
|
|
// One work item
|
2018-07-23 02:54:28 +02:00
|
|
|
ExecRec work;
|
2022-07-12 12:41:15 +02:00
|
|
|
// Wait for the first task without spinning, in case the thread is never actually used.
|
|
|
|
|
dequeWork</* SpinWait: */ false>(&work);
|
2025-11-23 02:13:46 +01:00
|
|
|
// Loop until shutdown task is received
|
|
|
|
|
while (VL_UNLIKELY(work.m_fnp != shutdownTask)) {
|
2022-06-27 15:16:20 +02:00
|
|
|
work.m_fnp(work.m_selfp, work.m_evenCycle);
|
2022-07-12 12:41:15 +02:00
|
|
|
// Wait for next task with spinning.
|
|
|
|
|
dequeWork</* SpinWait: */ true>(&work);
|
2018-07-23 02:54:28 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-11-23 02:13:46 +01:00
|
|
|
// Thread entry-point trampoline; argp is the owning VlWorkerThread.
// Signature is pthread_create-compatible; also used for the std::thread path.
void* VlWorkerThread::start(void* argp) {
    // static_cast is the correct named cast for void* -> object pointer
    // (reinterpret_cast was overkill here)
    static_cast<VlWorkerThread*>(argp)->main();
    return nullptr;
}
|
2018-07-23 02:54:28 +02:00
|
|
|
|
|
|
|
|
//=============================================================================
|
|
|
|
|
// VlThreadPool
|
|
|
|
|
|
2022-07-12 12:41:15 +02:00
|
|
|
// Build a pool of nThreads workers (all initially unassigned), then apply
// the NUMA/affinity policy and record its status string for reporting.
VlThreadPool::VlThreadPool(VerilatedContext* contextp, unsigned nThreads) {
    unsigned workerId = 0;
    while (workerId < nThreads) {
        m_workers.push_back(new VlWorkerThread{contextp});
        m_unassignedWorkers.push(workerId);
        ++workerId;
    }
    // Pin workers to processors where sensible; returns a human-readable status
    m_numaStatus = numaAssign();
}
|
|
|
|
|
|
|
|
|
|
// Tear down the pool. Deleting a worker shuts it down and joins its thread,
// so this blocks until all pool threads have exited.
VlThreadPool::~VlThreadPool() {
    // Each ~WorkerThread will wait for its thread to exit.
    for (VlWorkerThread* const workerp : m_workers) delete workerp;
}
|
2025-04-02 14:27:23 +02:00
|
|
|
|
|
|
|
|
// Choose and apply a processor-affinity assignment for the pool's worker
// threads on Linux-like hosts, pairing threads with cores read from
// /proc/cpuinfo. Returns a human-readable status string: either the list of
// assigned processors per thread, or a reason why no assignment was made.
std::string VlThreadPool::numaAssign() {
#if defined(__linux) || defined(CPU_ZERO) || defined(VL_CPPCHECK)  // Linux-like pthreads
    // Get number of processor available to the current process
    const unsigned num_proc = VlOs::getProcessAvailableParallelism();
    if (!num_proc) return "Can't determine number of available threads";
    // If fewer than hardware threads in the host, user presumably set affinity
    if (num_proc < std::thread::hardware_concurrency()) return "processor affinity already set";

    // Make a reasonable processor affinity selection
    const int num_threads = static_cast<int>(m_workers.size());
    if (num_threads < 2) return "too few threads";
    // Leave headroom: skip assignment when workers nearly fill the available
    // processors (the main thread also needs a processor)
    if (static_cast<unsigned>(num_threads) >= num_proc - 1) return "too many threads";

    // Read CPU info.
    // Uncertain if any modern system has gaps in the processor id (Solaris
    // did), but just in case use vectors instead of processor number math.
    //
    // Currently ignoring socket number "physical id".
    // If processor numbers are sequential on sockets, algorithm works out ok.
    // If processor numbers are strided on sockets, algorithm also works out ok.
    std::ifstream is{"/proc/cpuinfo"};
    if (VL_UNLIKELY(!is)) return "%Warning: no /proc/cpuinfo";

    std::vector<int> unassigned_processors;  // Processors to assign in sorted order
    std::map<int, int> processor_core;  // processor id -> core id
    std::multimap<int, int> core_processors;  // core id -> all its processor ids (SMT siblings)
    std::set<int> cores;  // distinct core ids seen
    {
        // Parse "processor : N" / "core id : N" pairs; a "core id" line applies
        // to the most recently seen "processor" line.
        int processor = -1;
        while (!is.eof()) {
            std::string line;
            std::getline(is, line);
            std::string::size_type pos = line.find(":");
            int number = -1;
            if (pos != std::string::npos) number = atoi(line.c_str() + pos + 1);
            if (line.compare(0, std::strlen("processor"), "processor") == 0) {
                processor = number;
            } else if (line.compare(0, std::strlen("core id"), "core id") == 0) {
                const int core = number;
                // std::cout << "p" << processor << " socket " << socket << " c" << core <<
                // std::endl;
                cores.emplace(core);
                processor_core[processor] = core;
                core_processors.emplace(core, processor);
                unassigned_processors.push_back(processor);
            }
        }
    }

    // Start scheduling on the current CPU + 1.
    // This will help to land on the same socket as current CPU, and also
    // help make sure that different processes have different masks (when
    // num_threads is not a common-factor of the processor count).
    std::sort(unassigned_processors.begin(), unassigned_processors.end());
    {
        const int on_cpu = sched_getcpu();  // TODO: this is a system call. Not exactly cheap.
        // Rotate the sorted list so processors after on_cpu come first
        bool hit = false;
        std::vector<int> new_front;
        std::vector<int> new_back;
        for (const int processor : unassigned_processors) {
            if (hit) {
                new_front.push_back(processor);
            } else {
                new_back.push_back(processor);
            }
            if (processor == on_cpu) hit = true;
        }
        unassigned_processors = new_front;
        unassigned_processors.insert(unassigned_processors.end(), new_back.begin(),
                                     new_back.end());
    }

    // If less threads than cores, we can schedule per-core
    const bool core_per_thread = num_threads <= cores.size();

    // Compute core mapping: thread number -> set of processors it may run on
    std::multimap<int, int> thread_processors;
    {
        std::set<int> assigned_processors;
        int thread = 0;
        for (const int processor : unassigned_processors) {
            // Find free processor, the current thread can use that
            if (assigned_processors.find(processor) != assigned_processors.end()) continue;
            assigned_processors.emplace(processor);
            thread_processors.emplace(thread, processor);
            if (core_per_thread) {
                // Also include all other processors same core,
                // so that another thread doesn't land on different processor in same core
                const int core = processor_core[processor];
                const auto bounds = core_processors.equal_range(core);
                for (auto it{bounds.first}; it != bounds.second; ++it) {
                    if (assigned_processors.find(it->second) != assigned_processors.end())
                        continue;
                    if (it->second == processor) continue;
                    thread_processors.emplace(thread, it->second);
                    assigned_processors.emplace(it->second);
                }
            }
            // Prepare for next loop: assign processors round-robin across threads
            thread = (thread + 1) % num_threads;
        }
    }

    // Set affinity on each worker thread, building the status string as we go
    std::string status = "assigned ";
    for (int thread = 0; thread < num_threads; ++thread) {
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);

        const auto bounds = thread_processors.equal_range(thread);
        for (auto it{bounds.first}; it != bounds.second; ++it) {
            if (it != bounds.first) status += ',';
            status += std::to_string(it->second);
            CPU_SET(it->second, &cpuset);
        }
        status += ";";

        // NOTE(review): uses m_cthread.native_handle(); presumably this
        // Linux path is only compiled when std::thread (not VL_USE_PTHREADS)
        // is in use -- TODO confirm the #ifdef interaction.
        const int rc = pthread_setaffinity_np(m_workers[thread]->m_cthread.native_handle(),
                                              sizeof(cpu_set_t), &cpuset);
        if (rc != 0) return "%Warning: pthread_setaffinity_np failed";
    }
    // std::cout << "Status: " << status << std::endl;
    return status;
#else
    return "non-supported host OS";
#endif
}
|