From 0ed58a42175f45a1bc2aefd244143aae591f530e Mon Sep 17 00:00:00 2001
From: Patrick Stewart
Date: Mon, 7 Oct 2019 19:27:31 -0400
Subject: [PATCH] Fix multithreaded yield behavior when no work.

Signed-off-by: Wilson Snyder
---
 Changes                       |   2 +
 include/verilated.h           |   6 +-
 include/verilated_threads.cpp |  38 ++---------
 include/verilated_threads.h   | 119 +++++++++++-----------------------
 4 files changed, 52 insertions(+), 113 deletions(-)

diff --git a/Changes b/Changes
index ec6d381d5..7b0c95540 100644
--- a/Changes
+++ b/Changes
@@ -12,6 +12,8 @@ The contributors that suggested a given feature are shown in []. Thanks!
 
 **** Increase case duplicate/incomplete to 16 bit tables, bug1545. [Yossi Nivin]
 
+**** Fix multithreaded yield behavior when no work. [Patrick Stewart]
+
 
 * Verilator 4.020 2019-10-06
 
diff --git a/include/verilated.h b/include/verilated.h
index 0d95f1fd0..d1a5fe8ef 100644
--- a/include/verilated.h
+++ b/include/verilated.h
@@ -141,7 +141,7 @@ class VL_CAPABILITY("mutex") VerilatedMutex {
     bool try_lock() VL_TRY_ACQUIRE(true) { return m_mutex.try_lock(); }
 };
 
-/// Lock guard for mutex (ala std::lock_guard), wrapped to allow -fthread_safety checks
+/// Lock guard for mutex (ala std::unique_lock), wrapped to allow -fthread_safety checks
 class VL_SCOPED_CAPABILITY VerilatedLockGuard {
     VL_UNCOPYABLE(VerilatedLockGuard);
 private:
@@ -154,6 +154,8 @@ class VL_SCOPED_CAPABILITY VerilatedLockGuard {
     ~VerilatedLockGuard() VL_RELEASE() { m_mutexr.unlock(); }
+    void lock() VL_ACQUIRE(mutexr) { m_mutexr.lock(); }
+    void unlock() VL_RELEASE() { m_mutexr.unlock(); }
 };
 #else  // !VL_THREADED
@@ -171,6 +173,8 @@ class VerilatedLockGuard {
 public:
     explicit VerilatedLockGuard(VerilatedMutex&) {}
     ~VerilatedLockGuard() {}
+    void lock() {}
+    void unlock() {}
 };
 #endif  // VL_THREADED

diff --git a/include/verilated_threads.cpp b/include/verilated_threads.cpp
index 92539640e..d9ce2ad7c 100644
--- a/include/verilated_threads.cpp
+++ b/include/verilated_threads.cpp
@@ -24,7 +24,7 @@
 #include <cstdio>
 
-std::atomic<vluint64_t> VlNotification::s_yields;
+std::atomic<vluint64_t> VlMTaskVertex::s_yields;
 
 VL_THREAD_LOCAL VlThreadPool::ProfileTrace* VlThreadPool::t_profilep = NULL;
 
@@ -41,7 +41,8 @@ VlMTaskVertex::VlMTaskVertex(vluint32_t upstreamDepCount)
 // VlWorkerThread
 
 VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
-    : m_poolp(poolp)
+    : m_ready_size(0)
+    , m_poolp(poolp)
     , m_profiling(profiling)
     , m_exiting(false)
     // Must init this last -- after setting up fields that it might read:
@@ -49,12 +50,7 @@ VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
 
 VlWorkerThread::~VlWorkerThread() {
     m_exiting.store(true, std::memory_order_release);
-    {
-        VerilatedLockGuard lk(m_mutex);
-        if (sleeping()) {
-            wakeUp();
-        }
-    }
+    wakeUp();
     // The thread should exit; join it.
     m_cthread.join();
 }
@@ -64,38 +60,18 @@ void VlWorkerThread::workerLoop() {
         m_poolp->setupProfilingClientThread();
     }
 
-    VlNotification alarm;
     ExecRec work;
     work.m_fnp = NULL;
 
     while (1) {
-        bool sleep = false;
-        if (VL_UNLIKELY(!work.m_fnp)) {
-            // Look for work
-            VerilatedLockGuard lk(m_mutex);
-            if (VL_LIKELY(!m_ready.empty())) {
-                dequeWork(&work);
-            } else {
-                // No work available, prepare to sleep. Pass alarm/work
-                // into m_sleepAlarm so wakeUp will tell this function.
-                //
-                // Must modify m_sleepAlarm in the same critical section as
-                // the check for ready work, otherwise we could race with
-                // another thread enqueueing work and never be awoken.
-                m_sleepAlarm.first = &alarm;
-                m_sleepAlarm.second = &work;
-                sleep = true;
-            }
+        if (VL_LIKELY(!work.m_fnp)) {
+            dequeWork(&work);
         }
 
         // Do this here, not above, to avoid a race with the destructor.
         if (VL_UNLIKELY(m_exiting.load(std::memory_order_acquire))) break;
 
-        if (VL_UNLIKELY(sleep)) {
-            alarm.waitForNotification();  // ZZZzzzzz
-            alarm.reset();
-        }
 
         if (VL_LIKELY(work.m_fnp)) {
             work.m_fnp(work.m_evenCycle, work.m_sym);
             work.m_fnp = NULL;
         }
@@ -194,7 +170,7 @@ void VlThreadPool::profileDump(const char* filenamep, vluint64_t ticksElapsed) {
     fprintf(fp, "VLPROF arg +verilator+prof+threads+window+%u\n",
             Verilated::profThreadsWindow());
     fprintf(fp, "VLPROF stat yields %" VL_PRI64 "u\n",
-            VlNotification::yields());
+            VlMTaskVertex::yields());
     vluint32_t thread_id = 0;
     for (ProfileSet::iterator pit = m_allProfiles.begin();
diff --git a/include/verilated_threads.h b/include/verilated_threads.h
index 6bd311e32..f704e7b5a 100644
--- a/include/verilated_threads.h
+++ b/include/verilated_threads.h
@@ -25,6 +25,7 @@
 #include "verilatedos.h"
 #include "verilated.h"  // for VerilatedMutex and clang annotations
+#include <condition_variable>
 #include <set>
 #include <vector>
 #if defined(__linux)
@@ -39,71 +40,12 @@
 // as a void* here.
 typedef void* VlThrSymTab;
 
-class VlNotification {
-    // MEMBERS
-    std::atomic<bool> m_notified;  // Notification pending
-    static std::atomic<vluint64_t> s_yields;  // Statistics
-
-public:
-    // CONSTRUCTORS
-    VlNotification()
-        : m_notified(false) {
-        assert(atomic_is_lock_free(&m_notified));
-    }
-    ~VlNotification() {}
-
-    // METHODS
-    static vluint64_t yields() { return s_yields; }
-    static void yieldThread() {
-        ++s_yields;  // Statistics
-        std::this_thread::yield();
-    }
-
-    // Block until notify() has occurred, then return.
-    // If notify() has already occurred, return immediately.
-    //
-    // This is logically const: the object will remain in notified state
-    // after WaitForNotification() returns, so you could notify more than
-    // one thread of the same event.
-    inline void waitForNotification() {
-        unsigned ct = 0;
-        while (VL_UNLIKELY(!notified())) {
-            VL_CPU_RELAX();
-            ++ct;
-            if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
-                ct = 0;
-                yieldThread();
-            }
-        }
-    }
-
-    // The 'inline' keyword here means nothing to the compiler, it's
-    // implicit on methods defined within the class body anyway.
-    //
-    // 'inline' is attached to this method, and others in this file,
-    // to remind humans that some routines in this file are called many
-    // times per cycle in threaded mode. Such routines should be
-    // inlinable; that's why they're declared in the .h and not the .cpp.
-    inline bool notified() {
-        return m_notified.load(std::memory_order_acquire);
-    }
-    // Set notified state. If state is already notified,
-    // it remains so.
-    inline void notify() {
-        m_notified.store(true, std::memory_order_release);
-    }
-    // Reset the state to un-notified state, which is also the
-    // state of a new Notification object.
-    inline void reset() {
-        m_notified.store(false, std::memory_order_relaxed);
-    }
-};
-
 typedef void (*VlExecFnp)(bool, VlThrSymTab);
 
 /// Track dependencies for a single MTask.
 class VlMTaskVertex {
     // MEMBERS
+    static std::atomic<vluint64_t> s_yields;  // Statistics
     // On even cycles, _upstreamDepsDone increases as upstream
     // dependencies complete.  When it reaches _upstreamDepCount,
@@ -133,6 +75,12 @@ public:
     explicit VlMTaskVertex(vluint32_t upstreamDepCount);
     ~VlMTaskVertex() {}
 
+    static vluint64_t yields() { return s_yields; }
+    static void yieldThread() {
+        ++s_yields;  // Statistics
+        std::this_thread::yield();
+    }
+
     // Upstream mtasks must call this when they complete.
     // Returns true when the current MTaskVertex becomes ready to execute,
     // false while it's still waiting on more dependencies.
@@ -160,7 +108,7 @@ public:
             ++ct;
             if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
                 ct = 0;
-                VlNotification::yieldThread();
+                yieldThread();
             }
         }
     }
@@ -238,19 +186,19 @@ private:
     // MEMBERS
     VerilatedMutex m_mutex;
+    std::condition_variable_any m_cv;
+    // Only notify the condition_variable if the worker is waiting
+    bool m_waiting VL_GUARDED_BY(m_mutex);
     // Why a vector? We expect the pending list to be very short, typically
     // 0 or 1 or 2, so popping from the front shouldn't be
     // expensive. Revisit if we ever have longer queues...
     std::vector<ExecRec> m_ready VL_GUARDED_BY(m_mutex);
+    // Store the size atomically, so we can spin wait
+    std::atomic<size_t> m_ready_size;
 
     VlThreadPool* m_poolp;  // Our associated thread pool
 
-    // If values stored are non-NULL, the thread is asleep pending new
-    // work. If the thread is not asleep, both parts of m_sleepAlarm must
-    // be NULL.
-    std::pair<VlNotification*, ExecRec*> m_sleepAlarm VL_GUARDED_BY(m_mutex);
-
     bool m_profiling;  // Is profiling enabled?
     std::atomic<bool> m_exiting;  // Worker thread should exit
     std::thread m_cthread;  // Underlying C++ thread record
@@ -263,29 +211,38 @@ public:
     ~VlWorkerThread();
 
     // METHODS
-    inline void dequeWork(ExecRec* workp) VL_REQUIRES(m_mutex) {
+    inline void dequeWork(ExecRec* workp) {
+        // Spin for a while, waiting for new data
+        for (int i = 0; i < VL_LOCK_SPINS; ++i) {
+            if (VL_LIKELY(m_ready_size.load(std::memory_order_relaxed))) {
+                break;
+            }
+            VL_CPU_RELAX();
+        }
+        VerilatedLockGuard lk(m_mutex);
+        while (m_ready.empty()) {
+            m_waiting = true;
+            m_cv.wait(lk);
+        }
+        m_waiting = false;
         // As noted above this is inefficient if our ready list is ever
         // long (but it shouldn't be)
         *workp = m_ready.front();
         m_ready.erase(m_ready.begin());
+        m_ready_size.fetch_sub(1, std::memory_order_relaxed);
     }
-    inline void wakeUp() VL_REQUIRES(m_mutex) {
-        VlNotification* notifyp = m_sleepAlarm.first;
-        m_sleepAlarm.first = NULL;  // NULL+NULL means wake
-        m_sleepAlarm.second = NULL;
-        notifyp->notify();
-    }
-    inline bool sleeping() VL_REQUIRES(m_mutex) {
-        return (m_sleepAlarm.first != NULL);
+    inline void wakeUp() {
+        addTask(nullptr, false, nullptr);
     }
     inline void addTask(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym) {
-        VerilatedLockGuard lk(m_mutex);
-        m_ready.emplace_back(fnp, evenCycle, sym);
-        if (VL_LIKELY(sleeping())) {  // Generally queue is waiting for work
-            // Awaken thread
-            dequeWork(m_sleepAlarm.second);
-            wakeUp();
+        bool notify;
+        {
+            VerilatedLockGuard lk(m_mutex);
+            m_ready.emplace_back(fnp, evenCycle, sym);
+            m_ready_size.fetch_add(1, std::memory_order_relaxed);
+            notify = m_waiting;
         }
+        if (notify) m_cv.notify_one();
     }
 
     void workerLoop();
     static void startWorker(VlWorkerThread* workerp);
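
The core idea of the patch, spin briefly on an atomic size counter, then block on a condition variable, with producers notifying only when a consumer is actually waiting, can be seen in isolation in the standalone sketch below. This is not Verilator code: the class name, the spin budget, and the int work items are placeholders chosen for illustration, and std::condition_variable with std::unique_lock stands in for Verilator's condition_variable_any plus VerilatedLockGuard pairing.

#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

static const int DEMO_SPIN_BUDGET = 50000;  // Assumed spin count; tune per platform

class SpinThenBlockQueue {  // Hypothetical name, illustration only
    std::mutex m_mutex;
    std::condition_variable m_cv;
    bool m_waiting = false;               // Only notify when the consumer is blocked
    std::vector<int> m_ready;             // Pending work items
    std::atomic<size_t> m_readySize{0};   // Lock-free size for the spin phase
public:
    void push(int item) {
        bool notify;
        {
            std::lock_guard<std::mutex> lk(m_mutex);
            m_ready.push_back(item);
            m_readySize.fetch_add(1, std::memory_order_relaxed);
            notify = m_waiting;           // Snapshot the waiting flag under the lock
        }
        if (notify) m_cv.notify_one();    // Notify outside the critical section
    }
    int pop() {
        // Spin for a while, hoping work arrives without paying for a blocking wait
        for (int i = 0; i < DEMO_SPIN_BUDGET; ++i) {
            if (m_readySize.load(std::memory_order_relaxed)) break;
        }
        std::unique_lock<std::mutex> lk(m_mutex);
        while (m_ready.empty()) {         // Re-check under the lock; no lost wakeups
            m_waiting = true;
            m_cv.wait(lk);
        }
        m_waiting = false;
        int item = m_ready.front();
        m_ready.erase(m_ready.begin());
        m_readySize.fetch_sub(1, std::memory_order_relaxed);
        return item;
    }
};

int main() {
    SpinThenBlockQueue q;
    std::thread worker([&] {
        for (int i = 0; i < 3; ++i) printf("got %d\n", q.pop());
    });
    for (int i = 0; i < 3; ++i) q.push(i);
    worker.join();
    return 0;
}

Because the consumer re-checks the queue under the mutex before waiting, a producer that enqueues between the spin and the wait cannot be missed; this is the same reasoning the removed m_sleepAlarm comment gave for keeping the check and the sleep registration in one critical section.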