From 0ed58a42175f45a1bc2aefd244143aae591f530e Mon Sep 17 00:00:00 2001
From: Patrick Stewart
Date: Mon, 7 Oct 2019 19:27:31 -0400
Subject: [PATCH] Fix multithreaded yield behavior when no work.

Signed-off-by: Wilson Snyder
---
 Changes                       |   2 +
 include/verilated.h           |   6 +-
 include/verilated_threads.cpp |  38 ++---------
 include/verilated_threads.h   | 119 +++++++++++-----------------------
 4 files changed, 52 insertions(+), 113 deletions(-)

diff --git a/Changes b/Changes
index ec6d381d5..7b0c95540 100644
--- a/Changes
+++ b/Changes
@@ -12,6 +12,8 @@ The contributors that suggested a given feature are shown in []. Thanks!
 
 **** Increase case duplicate/incomplete to 16 bit tables, bug1545. [Yossi Nivin]
 
+**** Fix multithreaded yield behavior when no work. [Patrick Stewart]
+
 
 * Verilator 4.020 2019-10-06
 
diff --git a/include/verilated.h b/include/verilated.h
index 0d95f1fd0..d1a5fe8ef 100644
--- a/include/verilated.h
+++ b/include/verilated.h
@@ -141,7 +141,7 @@ class VL_CAPABILITY("mutex") VerilatedMutex {
     bool try_lock() VL_TRY_ACQUIRE(true) { return m_mutex.try_lock(); }
 };
 
-/// Lock guard for mutex (ala std::lock_guard), wrapped to allow -fthread_safety checks
+/// Lock guard for mutex (ala std::unique_lock), wrapped to allow -fthread_safety checks
 class VL_SCOPED_CAPABILITY VerilatedLockGuard {
     VL_UNCOPYABLE(VerilatedLockGuard);
 private:
@@ -154,6 +154,8 @@ class VL_SCOPED_CAPABILITY VerilatedLockGuard {
     ~VerilatedLockGuard() VL_RELEASE() { m_mutexr.unlock(); }
+    void lock() VL_ACQUIRE(mutexr) { m_mutexr.lock(); }
+    void unlock() VL_RELEASE() { m_mutexr.unlock(); }
 };
 #else  // !VL_THREADED
@@ -171,6 +173,8 @@ class VerilatedLockGuard {
 public:
     explicit VerilatedLockGuard(VerilatedMutex&) {}
     ~VerilatedLockGuard() {}
+    void lock() {}
+    void unlock() {}
 };
 #endif  // VL_THREADED

diff --git a/include/verilated_threads.cpp b/include/verilated_threads.cpp
index 92539640e..d9ce2ad7c 100644
--- a/include/verilated_threads.cpp
+++ b/include/verilated_threads.cpp
@@ -24,7 +24,7 @@
 #include <cstdio>
 
-std::atomic<vluint64_t> VlNotification::s_yields;
+std::atomic<vluint64_t> VlMTaskVertex::s_yields;
 
 VL_THREAD_LOCAL VlThreadPool::ProfileTrace* VlThreadPool::t_profilep = NULL;
 
@@ -41,7 +41,8 @@ VlMTaskVertex::VlMTaskVertex(vluint32_t upstreamDepCount)
 // VlWorkerThread
 
 VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
-    : m_poolp(poolp)
+    : m_ready_size(0)
+    , m_poolp(poolp)
     , m_profiling(profiling)
     , m_exiting(false)
     // Must init this last -- after setting up fields that it might read:
@@ -49,12 +50,7 @@ VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
 
 VlWorkerThread::~VlWorkerThread() {
     m_exiting.store(true, std::memory_order_release);
-    {
-        VerilatedLockGuard lk(m_mutex);
-        if (sleeping()) {
-            wakeUp();
-        }
-    }
+    wakeUp();
     // The thread should exit; join it.
     m_cthread.join();
 }
@@ -64,38 +60,18 @@ void VlWorkerThread::workerLoop() {
         m_poolp->setupProfilingClientThread();
     }
 
-    VlNotification alarm;
     ExecRec work;
     work.m_fnp = NULL;
 
     while (1) {
-        bool sleep = false;
-        if (VL_UNLIKELY(!work.m_fnp)) {
-            // Look for work
-            VerilatedLockGuard lk(m_mutex);
-            if (VL_LIKELY(!m_ready.empty())) {
-                dequeWork(&work);
-            } else {
-                // No work available, prepare to sleep. Pass alarm/work
-                // into m_sleepAlarm so wakeUp will tell this function.
-                //
-                // Must modify m_sleepAlarm in the same critical section as
-                // the check for ready work, otherwise we could race with
-                // another thread enqueueing work and never be awoken.
-                m_sleepAlarm.first = &alarm;
-                m_sleepAlarm.second = &work;
-                sleep = true;
-            }
+        if (VL_LIKELY(!work.m_fnp)) {
+            dequeWork(&work);
         }
 
         // Do this here, not above, to avoid a race with the destructor.
         if (VL_UNLIKELY(m_exiting.load(std::memory_order_acquire))) break;
 
-        if (VL_UNLIKELY(sleep)) {
-            alarm.waitForNotification();  // ZZZzzzzz
-            alarm.reset();
-        }
 
         if (VL_LIKELY(work.m_fnp)) {
             work.m_fnp(work.m_evenCycle, work.m_sym);
             work.m_fnp = NULL;
         }
@@ -194,7 +170,7 @@ void VlThreadPool::profileDump(const char* filenamep, vluint64_t ticksElapsed) {
     fprintf(fp, "VLPROF arg +verilator+prof+threads+window+%u\n",
             Verilated::profThreadsWindow());
     fprintf(fp, "VLPROF stat yields %" VL_PRI64 "u\n",
-            VlNotification::yields());
+            VlMTaskVertex::yields());
     vluint32_t thread_id = 0;
     for (ProfileSet::iterator pit = m_allProfiles.begin();
diff --git a/include/verilated_threads.h b/include/verilated_threads.h
index 6bd311e32..f704e7b5a 100644
--- a/include/verilated_threads.h
+++ b/include/verilated_threads.h
@@ -25,6 +25,7 @@
 #include "verilatedos.h"
 #include "verilated.h"  // for VerilatedMutex and clang annotations
+#include <condition_variable>
 #include <set>
 #include <vector>
 #if defined(__linux)
@@ -39,71 +40,12 @@
 // as a void* here.
 typedef void* VlThrSymTab;
 
-class VlNotification {
-    // MEMBERS
-    std::atomic<bool> m_notified;  // Notification pending
-    static std::atomic<vluint64_t> s_yields;  // Statistics
-
-public:
-    // CONSTRUCTORS
-    VlNotification()
-        : m_notified(false) {
-        assert(atomic_is_lock_free(&m_notified));
-    }
-    ~VlNotification() {}
-
-    // METHODS
-    static vluint64_t yields() { return s_yields; }
-    static void yieldThread() {
-        ++s_yields;  // Statistics
-        std::this_thread::yield();
-    }
-
-    // Block until notify() has occurred, then return.
-    // If notify() has already occurred, return immediately.
-    //
-    // This is logically const: the object will remain in notified state
-    // after WaitForNotification() returns, so you could notify more than
-    // one thread of the same event.
-    inline void waitForNotification() {
-        unsigned ct = 0;
-        while (VL_UNLIKELY(!notified())) {
-            VL_CPU_RELAX();
-            ++ct;
-            if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
-                ct = 0;
-                yieldThread();
-            }
-        }
-    }
-
-    // The 'inline' keyword here means nothing to the compiler, it's
-    // implicit on methods defined within the class body anyway.
-    //
-    // 'inline' is attached to this method, and others in this file,
-    // to remind humans that some routines in this file are called many
-    // times per cycle in threaded mode. Such routines should be
-    // inlinable; that's why they're declared in the .h and not the .cpp.
-    inline bool notified() {
-        return m_notified.load(std::memory_order_acquire);
-    }
-    // Set notified state. If state is already notified,
-    // it remains so.
-    inline void notify() {
-        m_notified.store(true, std::memory_order_release);
-    }
-    // Reset the state to un-notified state, which is also the
-    // state of a new Notification object.
-    inline void reset() {
-        m_notified.store(false, std::memory_order_relaxed);
-    }
-};
-
 typedef void (*VlExecFnp)(bool, VlThrSymTab);
 
 /// Track dependencies for a single MTask.
 class VlMTaskVertex {
     // MEMBERS
+    static std::atomic<vluint64_t> s_yields;  // Statistics
     // On even cycles, _upstreamDepsDone increases as upstream
     // dependencies complete.  When it reaches _upstreamDepCount,
@@ -133,6 +75,12 @@ public:
     explicit VlMTaskVertex(vluint32_t upstreamDepCount);
     ~VlMTaskVertex() {}
 
+    static vluint64_t yields() { return s_yields; }
+    static void yieldThread() {
+        ++s_yields;  // Statistics
+        std::this_thread::yield();
+    }
+
     // Upstream mtasks must call this when they complete.
     // Returns true when the current MTaskVertex becomes ready to execute,
     // false while it's still waiting on more dependencies.
@@ -160,7 +108,7 @@ public:
             ++ct;
             if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
                 ct = 0;
-                VlNotification::yieldThread();
+                yieldThread();
             }
         }
     }
@@ -238,19 +186,19 @@ private:
     // MEMBERS
     VerilatedMutex m_mutex;
+    std::condition_variable_any m_cv;
+    // Only notify the condition_variable if the worker is waiting
+    bool m_waiting VL_GUARDED_BY(m_mutex);
     // Why a vector? We expect the pending list to be very short, typically
     // 0 or 1 or 2, so popping from the front shouldn't be
     // expensive. Revisit if we ever have longer queues...
     std::vector<ExecRec> m_ready VL_GUARDED_BY(m_mutex);
+    // Store the size atomically, so we can spin wait
+    std::atomic<size_t> m_ready_size;
 
     VlThreadPool* m_poolp;  // Our associated thread pool
 
-    // If values stored are non-NULL, the thread is asleep pending new
-    // work. If the thread is not asleep, both parts of m_sleepAlarm must
-    // be NULL.
-    std::pair<VlNotification*, ExecRec*> m_sleepAlarm VL_GUARDED_BY(m_mutex);
-
     bool m_profiling;  // Is profiling enabled?
     std::atomic<bool> m_exiting;  // Worker thread should exit
     std::thread m_cthread;  // Underlying C++ thread record
@@ -263,29 +211,38 @@ public:
     ~VlWorkerThread();
 
     // METHODS
-    inline void dequeWork(ExecRec* workp) VL_REQUIRES(m_mutex) {
+    inline void dequeWork(ExecRec* workp) {
+        // Spin for a while, waiting for new data
+        for (int i = 0; i < VL_LOCK_SPINS; ++i) {
+            if (VL_LIKELY(m_ready_size.load(std::memory_order_relaxed))) {
+                break;
+            }
+            VL_CPU_RELAX();
+        }
+        VerilatedLockGuard lk(m_mutex);
+        while (m_ready.empty()) {
+            m_waiting = true;
+            m_cv.wait(lk);
+        }
+        m_waiting = false;
         // As noted above this is inefficient if our ready list is ever
         // long (but it shouldn't be)
         *workp = m_ready.front();
         m_ready.erase(m_ready.begin());
+        m_ready_size.fetch_sub(1, std::memory_order_relaxed);
     }
-    inline void wakeUp() VL_REQUIRES(m_mutex) {
-        VlNotification* notifyp = m_sleepAlarm.first;
-        m_sleepAlarm.first = NULL;  // NULL+NULL means wake
-        m_sleepAlarm.second = NULL;
-        notifyp->notify();
-    }
-    inline bool sleeping() VL_REQUIRES(m_mutex) {
-        return (m_sleepAlarm.first != NULL);
+    inline void wakeUp() {
+        addTask(nullptr, false, nullptr);
     }
     inline void addTask(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym) {
-        VerilatedLockGuard lk(m_mutex);
-        m_ready.emplace_back(fnp, evenCycle, sym);
-        if (VL_LIKELY(sleeping())) {  // Generally queue is waiting for work
-            // Awaken thread
-            dequeWork(m_sleepAlarm.second);
-            wakeUp();
+        bool notify;
+        {
+            VerilatedLockGuard lk(m_mutex);
+            m_ready.emplace_back(fnp, evenCycle, sym);
+            m_ready_size.fetch_add(1, std::memory_order_relaxed);
+            notify = m_waiting;
         }
+        if (notify) m_cv.notify_one();
     }
 
     void workerLoop();
     static void startWorker(VlWorkerThread* workerp);
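
The core idea of the patch, spin briefly on an atomic size counter, then block on a condition variable, with producers notifying only when a consumer is actually waiting, can be seen in isolation in the standalone sketch below. This is not Verilator code: the class name, the spin budget, and the int work items are placeholders chosen for illustration, and std::condition_variable with std::unique_lock stands in for Verilator's condition_variable_any plus VerilatedLockGuard pairing.

#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

static const int DEMO_SPIN_BUDGET = 50000;  // Assumed spin count; tune per platform

class SpinThenBlockQueue {  // Hypothetical name, illustration only
    std::mutex m_mutex;
    std::condition_variable m_cv;
    bool m_waiting = false;               // Only notify when the consumer is blocked
    std::vector<int> m_ready;             // Pending work items
    std::atomic<size_t> m_readySize{0};   // Lock-free size for the spin phase
public:
    void push(int item) {
        bool notify;
        {
            std::lock_guard<std::mutex> lk(m_mutex);
            m_ready.push_back(item);
            m_readySize.fetch_add(1, std::memory_order_relaxed);
            notify = m_waiting;           // Snapshot the waiting flag under the lock
        }
        if (notify) m_cv.notify_one();    // Notify outside the critical section
    }
    int pop() {
        // Spin for a while, hoping work arrives without paying for a blocking wait
        for (int i = 0; i < DEMO_SPIN_BUDGET; ++i) {
            if (m_readySize.load(std::memory_order_relaxed)) break;
        }
        std::unique_lock<std::mutex> lk(m_mutex);
        while (m_ready.empty()) {         // Re-check under the lock; no lost wakeups
            m_waiting = true;
            m_cv.wait(lk);
        }
        m_waiting = false;
        int item = m_ready.front();
        m_ready.erase(m_ready.begin());
        m_readySize.fetch_sub(1, std::memory_order_relaxed);
        return item;
    }
};

int main() {
    SpinThenBlockQueue q;
    std::thread worker([&] {
        for (int i = 0; i < 3; ++i) printf("got %d\n", q.pop());
    });
    for (int i = 0; i < 3; ++i) q.push(i);
    worker.join();
    return 0;
}

Because the consumer re-checks the queue under the mutex before waiting, a producer that enqueues between the spin and the wait cannot be missed; this is the same reasoning the removed m_sleepAlarm comment gave for keeping the check and the sleep registration in one critical section.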