From 0722f47539113382db67d94f46e28cb8b22c2a37 Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Fri, 27 May 2022 16:57:51 +0100
Subject: [PATCH 01/19] Improve V3MergeCond by reordering statements (#3125)

V3MergeCond merges consecutive conditional `_ = cond ? _ : _` and
`if (cond) ...` statements. This patch adds an analysis and ordering
phase that moves statements with identical conditions closer to each
other, in order to enable more merging opportunities. This in turn
eliminates a lot of repeated conditionals, which reduces dynamic branch
count and branch misprediction rate. Observed 6.5% improvement on
multi-threaded large designs, at the cost of less than 2% increase in
Verilation time.
---
 Changes                               |   1 +
 src/V3AstUserAllocator.h              |   2 +-
 src/V3MergeCond.cpp                   | 666 ++++++++++++++++++++------
 test_regress/t/t_merge_cond.pl        |   4 +-
 test_regress/t/t_merge_cond_blowup.pl |  34 ++
 test_regress/t/t_merge_cond_blowup.v  |  55 +++
 6 files changed, 602 insertions(+), 160 deletions(-)
 create mode 100755 test_regress/t/t_merge_cond_blowup.pl
 create mode 100644 test_regress/t/t_merge_cond_blowup.v

diff --git a/Changes b/Changes
index d6067565d..a0eb2b1d1 100644
--- a/Changes
+++ b/Changes
@@ -17,6 +17,7 @@ Verilator 4.223 devel
 * Add assert when VerilatedContext is mis-deleted (#3121). [Rupert Swarbrick]
 * Define VM_TRACE_VCD when tracing in VCD format. [Geza Lore, Shunyao CAD]
 * Support non-ANSI interface port declarations (#3439). [Geza Lore, Shunyao CAD]
+* Improve conditional merging optimization (#3125). [Geza Lore, Shunyao CAD]
 * Fix hang with large case statement optimization (#3405). [Mike Urbach]
 * Fix 'with' operator with type casting (#3387). [xiak95]
 * Fix incorrect conditional merging (#3409). [Raynard Qiao]
diff --git a/src/V3AstUserAllocator.h b/src/V3AstUserAllocator.h
index d230f0829..8d63ad5a9 100644
--- a/src/V3AstUserAllocator.h
+++ b/src/V3AstUserAllocator.h
@@ -106,7 +106,7 @@ public:
     }
 
     // Get a reference to the user data
-    T_Data& operator()(const T_Node* nodep) {
+    T_Data& operator()(const T_Node* nodep) const {
         T_Data* const userp = getUserp(nodep);
         UASSERT_OBJ(userp, nodep, "Missing User data on const AstNode");
         return *userp;
diff --git a/src/V3MergeCond.cpp b/src/V3MergeCond.cpp
index 673326f27..3881c48df 100644
--- a/src/V3MergeCond.cpp
+++ b/src/V3MergeCond.cpp
@@ -42,6 +42,34 @@
 //
 //  Also merges consecutive AstNodeIf statements with the same condition.
 //
+//  Because this optimization has notable performance impact, we go further
+//  and perform code motion to try to move mergeable conditionals next to each
+//  other, which in turn enables us to merge more conditionals. To do this, we
+//  perform an analysis pass, followed by an optimization pass on the whole
+//  AstCFunc we are optimizing.
+//
+//  The analysis pass gathers, for each statement in the tree, the information
+//  relevant for determining whether two statements can be swapped, and some
+//  other additional information that is useful during optimization.
+//
+//  The optimization pass tries to move conditionals near each other, first by
+//  trying to move a conditional node backwards in the list, so it becomes the
+//  direct successor of another earlier conditional with the same condition.
+//  If this is not possible due to variable interference, then we additionally
+//  try to pull earlier conditionals with the same condition closer forward to
+//  be the immediate predecessor of the conditional node. We limit maximum
+//  distance a node can travel to an empirically chosen but otherwise arbitrary
+//  constant. This limits worst case complexity to be O(n) rather than O(n^2).
+//  The worst case complexity manifests when N/2 conditionals, all with unique
+//  conditions are succeeded by N/2 conditionals with the same unique
+//  conditions, such that each unique condition is used by exactly 2
+//  conditionals. In this case all N/2 nodes need to travel approx N/2 distance.
+//  Limiting the distance bounds the latter, hence limiting complexity.
+//
+//  Once the analysis and optimization passes have been applied to the whole
+//  function, any merged conditionals will then undergo the same analysis,
+//  optimization, and merging again in their individual branches.
+//
 //*************************************************************************
 
 #include "config_build.h"
@@ -51,71 +79,364 @@
 #include "V3MergeCond.h"
 #include "V3Stats.h"
 #include "V3Ast.h"
+#include "V3AstUserAllocator.h"
+#include "V3Hasher.h"
+#include "V3DupFinder.h"
+
+#include <queue>
+#include <set>
+
+namespace {
 
 //######################################################################
+// Utilities
 
-enum class Mergeable {
-    YES,  // Tree can be merged
-    NO_COND_ASSIGN,  // Tree cannot be merged because it contains an assignment to a condition
-    NO_IMPURE  // Tree cannot be merged because it contains an impure node
+// This function extracts the Cond node from the RHS of an assignment,
+// if there is one and it is in a supported position, which are:
+// - RHS is the Cond
+// - RHS is And(Const, Cond). This And is inserted often by V3Clean.
+AstNodeCond* extractCondFromRhs(AstNode* rhsp) {
+    if (AstNodeCond* const condp = VN_CAST(rhsp, NodeCond)) {
+        return condp;
+    } else if (const AstAnd* const andp = VN_CAST(rhsp, And)) {
+        if (AstNodeCond* const condp = VN_CAST(andp->rhsp(), NodeCond)) {
+            if (VN_IS(andp->lhsp(), Const)) return condp;
+        }
+    }
+    return nullptr;
+}
+
+// Predicate to check if two sets are disjoint. This is stable, as we only need
+// to determine if the sets contain a shared element, which is a boolean
+// property. It is also efficient as we use sorted sets, and therefore can
+// enumerate elements in order (what the ordering is, is unimportant), meaning
+// the worst case complexity is O(size of smaller set).
+bool areDisjoint(const std::set<const AstVar*>& a, const std::set<const AstVar*>& b) {
+    if (a.empty() || b.empty()) return true;
+    const auto endA = a.end();
+    const auto endB = b.end();
+    auto itA = a.begin();
+    auto itB = b.begin();
+    while (true) {
+        if (*itA == *itB) return false;
+        if (std::less<const AstVar*>{}(*itA, *itB)) {
+            itA = std::lower_bound(++itA, endA, *itB);
+            if (itA == endA) return true;
+        } else {
+            itB = std::lower_bound(++itB, endB, *itA);
+            if (itB == endB) return true;
+        }
+    }
+}
+
+//######################################################################
+// Structure containing information required for code motion/merging
+
+struct StmtProperties {
+    AstNode* m_condp = nullptr;  // The condition expression, if a conditional node
+    std::set<const AstVar*> m_rdVars;  // Variables read by this statement
+    std::set<const AstVar*> m_wrVars;  // Variables written by this statement
+    bool m_isFence = false;  // Nothing should move across this statement, nor should it be merged
+    AstNodeStmt* m_prevWithSameCondp = nullptr;  // Previous node in same list, with same condition
+    bool writesConditionVar() const {
+        // This relies on MarkVarsVisitor having been called on the condition node
+        for (const AstVar* const varp : m_wrVars) {
+            if (varp->user1()) return true;
+        }
+        return false;
+    }
 };
 
-class CheckMergeableVisitor final : public VNVisitor {
-private:
-    // STATE
-    bool m_condAssign = false;  // Does this tree contain an assignment to a condition variable??
-    bool m_impure = false;  // Does this tree contain an impure node?
+// We store the statement properties in user3 via AstUser3Allocator
+using StmtPropertiesAllocator = AstUser3Allocator<AstNodeStmt, StmtProperties>;
 
-    // METHODS
-    VL_DEBUG_FUNC;  // Declare debug()
+//######################################################################
+// Code motion analysis and implementation
 
-    // VISITORS
-    virtual void visit(AstNode* nodep) override {
-        if (m_impure) return;
-        // Clear if node is impure
-        if (!nodep->isPure()) {
-            UINFO(9, "Not mergeable due to impure node" << nodep << endl);
-            m_impure = true;
-            return;
+// Pure analysis visitor that builds the StmtProperties for each statement in the given
+// AstNode list (following AstNode::nextp())
+class CodeMotionAnalysisVisitor final : public VNVisitor {
+    // NODE STATE
+    // AstNodeStmt::user3   -> StmtProperties (accessed via m_stmtProperties, managed externally,
+    //                         see MergeCondVisitor::process)
+    // AstNode::user4       -> Used by V3Hasher
+    // AstNode::user5       -> AstNode*: Set on a condition node, points to the last conditional
+    //                         with that condition so far encountered in the same AstNode list
+
+    VNUser5InUse m_user5InUse;
+
+    StmtPropertiesAllocator& m_stmtProperties;
+
+    // MEMBERS
+    V3Hasher m_hasher;  // Used by V3DupFinder
+    // Stack of a V3DupFinder used for finding identical condition expressions within one
+    // statement list.
+    std::vector<V3DupFinder> m_stack;
+    StmtProperties* m_propsp = nullptr;  // StmtProperties structure of current AstNodeStmt
+
+    // Extract condition expression from a mergeable conditional statement, if any
+    static AstNode* extractCondition(const AstNodeStmt* nodep) {
+        AstNode* conditionp = nullptr;
+        if (const AstNodeAssign* const assignp = VN_CAST(nodep, NodeAssign)) {
+            if (AstNodeCond* const conditionalp = extractCondFromRhs(assignp->rhsp())) {
+                conditionp = conditionalp->condp();
+            }
+        } else if (const AstNodeIf* const ifp = VN_CAST(nodep, NodeIf)) {
+            conditionp = ifp->condp();
         }
+        while (AstCCast* const castp = VN_CAST(conditionp, CCast)) conditionp = castp->lhsp();
+        return conditionp;
+    }
+
+    void analyzeStmt(AstNodeStmt* nodep, bool tryCondMatch) {
+        VL_RESTORER(m_propsp);
+        // Keep hold of props of enclosing statement
+        StmtProperties* const outerPropsp = m_propsp;
+        // Grab the props of this statement
+        m_propsp = &m_stmtProperties(nodep);
+
+        // Extract condition from statement
+        if (AstNode* const condp = extractCondition(nodep)) {
+            // Remember condition node. We always need this as it is used in the later
+            // traversal.
+            m_propsp->m_condp = condp;
+            // If this is a conditional statement, try to find an earlier one with the same
+            // condition in the same list (unless we have been told not to bother because we know
+            // this node is in a singleton list).
+            if (tryCondMatch) {
+                // Grab the duplicate finder of this list
+                V3DupFinder& dupFinder = m_stack.back();
+                // Find a duplicate condition
+                const V3DupFinder::iterator& dit = dupFinder.findDuplicate(condp);
+                if (dit == dupFinder.end()) {
+                    // First time seeing this condition in the current list
+                    dupFinder.insert(condp);
+                    // Remember last statement with this condition (which is this statement)
+                    condp->user5p(nodep);
+                } else {
+                    // Seen a conditional with the same condition earlier in the current list
+                    AstNode* const firstp = dit->second;
+                    // Add to properties for easy retrieval during optimization
+                    m_propsp->m_prevWithSameCondp = static_cast<AstNodeStmt*>(firstp->user5p());
+                    // Remember last statement with this condition (which is this statement)
+                    firstp->user5p(nodep);
+                }
+            }
+        }
+
+        // Analyse this statement
+        analyzeNode(nodep);
+
+        // If there is an enclosing statement, propagate properties upwards
+        if (outerPropsp) {
+            // Add all rd/wr vars to outer statement
+            outerPropsp->m_rdVars.insert(m_propsp->m_rdVars.cbegin(), m_propsp->m_rdVars.cend());
+            outerPropsp->m_wrVars.insert(m_propsp->m_wrVars.cbegin(), m_propsp->m_wrVars.cend());
+            // If this statement is impure, the enclosing statement is also impure
+            if (m_propsp->m_isFence) outerPropsp->m_isFence = true;
+        }
+    }
+
+    void analyzeVarRef(AstVarRef* nodep) {
+        const VAccess access = nodep->access();
+        AstVar* const varp = nodep->varp();
+        // Gather read and written variables
+        if (access.isReadOrRW()) m_propsp->m_rdVars.insert(varp);
+        if (access.isWriteOrRW()) m_propsp->m_wrVars.insert(varp);
+    }
+
+    void analyzeNode(AstNode* nodep) {
+        // If an impure node under a statement, mark that statement as impure
+        if (m_propsp && !nodep->isPure()) m_propsp->m_isFence = true;
+        // Analyze children
         iterateChildrenConst(nodep);
     }
-    virtual void visit(AstVarRef* nodep) override {
-        if (m_impure || m_condAssign) return;
-        // Clear if it's an LValue referencing a marked variable
-        if (nodep->access().isWriteOrRW() && nodep->varp()->user1()) {
-            UINFO(9, "Not mergeable due assignment to condition" << nodep << endl);
-            m_condAssign = true;
+
+    // VISITORS
+    void visit(AstNode* nodep) override {
+        // Push a new stack entry at the start of a list, but only if the list is not a
+        // single element (this saves a lot of allocations in expressions)
+        bool singletonListStart = false;
+        if (nodep->backp()->nextp() != nodep) {  // If at head of list
+            singletonListStart = nodep->nextp() == nullptr;
+            if (!singletonListStart) m_stack.emplace_back(m_hasher);
         }
+
+        // Analyse node
+        if (AstNodeStmt* const stmtp = VN_CAST(nodep, NodeStmt)) {
+            analyzeStmt(stmtp, /*tryCondMatch:*/ !singletonListStart);
+        } else if (AstVarRef* const vrefp = VN_CAST(nodep, VarRef)) {
+            analyzeVarRef(vrefp);
+        } else {
+            analyzeNode(nodep);
+        }
+
+        // Pop the stack at the end of a list
+        if (!singletonListStart && !nodep->nextp()) m_stack.pop_back();
+    }
+
+    // CONSTRUCTOR
+    CodeMotionAnalysisVisitor(AstNode* nodep, StmtPropertiesAllocator& stmtProperties)
+        : m_stmtProperties(stmtProperties) {
+        iterateAndNextConstNull(nodep);
     }
 
 public:
-    CheckMergeableVisitor() = default;
-
-    // Return false if this node should not be merged at all because:
-    // - It contains an impure expression
-    // - It contains an LValue referencing the condition
-    Mergeable operator()(const AstNode* node) {
-        m_condAssign = false;
-        m_impure = false;
-        iterateChildrenConst(const_cast<AstNode*>(node));
-        if (m_impure) {  // Impure is stronger than cond assign
-            return Mergeable::NO_IMPURE;
-        } else if (m_condAssign) {
-            return Mergeable::NO_COND_ASSIGN;
-        } else {
-            return Mergeable::YES;
-        }
+    // Analyse the statement list starting at nodep, filling in stmtProperties.
+    static void analyze(AstNode* nodep, StmtPropertiesAllocator& stmtProperties) {
+        CodeMotionAnalysisVisitor{nodep, stmtProperties};
     }
 };
 
+class CodeMotionOptimizeVisitor final : public VNVisitor {
+    // Do not move a node more than this many statements.
+    // This bounds complexity at O(N), rather than O(N^2).
+    static constexpr unsigned MAX_DISTANCE = 500;
+
+    // NODE STATE
+    // AstNodeStmt::user3   -> StmtProperties (accessed via m_stmtProperties, managed externally,
+    //                         see MergeCondVisitor::process)
+    // AstNodeStmt::user4   -> bool: Already processed this node
+
+    VNUser4InUse m_user4InUse;
+
+    const StmtPropertiesAllocator& m_stmtProperties;
+
+    // MEMBERS
+
+    // Predicate that checks if the order of two statements can be swapped
+    bool areSwappable(const AstNodeStmt* ap, const AstNodeStmt* bp) const {
+        const StmtProperties& aProps = m_stmtProperties(ap);
+        const StmtProperties& bProps = m_stmtProperties(bp);
+        // Don't move across fences
+        if (aProps.m_isFence) return false;
+        if (bProps.m_isFence) return false;
+        // If either statement writes a variable that the other reads, they are not swappable
+        if (!areDisjoint(aProps.m_rdVars, bProps.m_wrVars)) return false;
+        if (!areDisjoint(bProps.m_rdVars, aProps.m_wrVars)) return false;
+        // If they both write to the same variable, they are not swappable
+        if (!areDisjoint(aProps.m_wrVars, bProps.m_wrVars)) return false;
+        // Otherwise good to go
+        return true;
+    }
+
+    // VISITORS
+    void visit(AstNodeStmt* nodep) override {
+        // Process only on first encounter
+        if (nodep->user4SetOnce()) return;
+        // First re-order children
+        iterateChildren(nodep);
+        // Grab hold of previous node with same condition
+        AstNodeStmt* prevp = m_stmtProperties(nodep).m_prevWithSameCondp;
+        // If no previous node with same condition, we are done
+        if (!prevp) return;
+#ifdef VL_DEBUG
+        {  // Sanity check, only in debug build, otherwise expensive
+            const AstNode* currp = prevp;
+            while (currp && currp != nodep) currp = currp->nextp();
+            UASSERT_OBJ(currp, nodep, "Predecessor not in same list as " << currp);
+        }
+#endif
+        // Otherwise try to move this node backwards, as close as we can to the previous node
+        // with the same condition
+        if (AstNodeStmt* predp = VN_CAST(nodep->backp(), NodeStmt)) {
+            // 'predp' is the newly computed predecessor node of 'nodep', which is initially
+            // (without movement) the 'backp' of the node.
+            for (unsigned i = MAX_DISTANCE; i; --i) {
+                // If the predecessor is the previous node with the same condition, job done
+                if (predp == prevp) break;
+                // Don't move past a non-statement (e.g.: AstVar), or end of list
+                AstNodeStmt* const backp = VN_CAST(predp->backp(), NodeStmt);
+                if (!backp) break;
+                // Don't swap statements if doing so would change program semantics
+                if (!areSwappable(predp, nodep)) break;
+                // Otherwise move 'nodep' back
+                predp = backp;
+            }
+
+            // If we decided that 'nodep' should be moved back
+            if (nodep->backp() != predp) {
+                // Move the current node to directly follow the computed predecessor
+                nodep->unlinkFrBack();
+                predp->addNextHere(nodep);
+                // If the predecessor is the previous node with the same condition, job done
+                if (predp == prevp) return;
+            }
+        }
+        // If we reach here, it means we were unable to move the current node all the way back
+        // such that it immediately follows the previous statement with the same condition. Now
+        // try to move all previous statements with the same condition forward, in the hope of
+        // compacting the list further.
+        for (AstNodeStmt* currp = nodep; prevp;
+             currp = prevp, prevp = m_stmtProperties(currp).m_prevWithSameCondp) {
+            // Move prevp (previous statement with same condition) towards currp
+            if (AstNodeStmt* succp = VN_CAST(prevp->nextp(), NodeStmt)) {
+                // 'succp' is the newly computed successor node of 'prevp', which is initially
+                // (without movement) the 'nextp' of the node.
+                for (unsigned i = MAX_DISTANCE; --i;) {
+                    // If the successor of the previous statement with same condition is the
+                    // target node, we are done with this predecessor
+                    if (succp == currp) break;
+                    // Don't move past a non-statement (e.g.: AstVar), or end of list
+                    AstNodeStmt* const nextp = VN_CAST(succp->nextp(), NodeStmt);
+                    if (!nextp) break;
+                    // Don't swap statements if doing so would change program semantics
+                    if (!areSwappable(prevp, succp)) break;
+                    // Otherwise move further forward
+                    succp = nextp;
+                }
+
+                // If we decided that 'prevp' should be moved forward
+                if (prevp->nextp() != succp) {
+                    // Move 'prevp' to directly before the computed successor
+                    prevp->unlinkFrBack();
+                    succp->addHereThisAsNext(prevp);
+                }
+            }
+        }
+    }
+
+    void visit(AstNode* nodep) override {}  // Ignore all non-statements
+
+    // CONSTRUCTOR
+    CodeMotionOptimizeVisitor(AstNode* nodep, const StmtPropertiesAllocator& stmtProperties)
+        : m_stmtProperties(stmtProperties) {
+        // We assert the given node is at the head of the list otherwise we might move a node
+        // before the given node. This is easy to fix in the above iteration with a check on a
+        // boundary node we should not move past, if we ever need to do so.
+        // Note: we will do iterateAndNextNull which requires nodep->backp() != nullptr anyway
+        UASSERT_OBJ(nodep->backp()->nextp() != nodep, nodep, "Must be at head of list");
+        // Optimize the list
+        iterateAndNextNull(nodep);
+    }
+
+public:
+    // Given an AstNode list (held via AstNode::nextp()), move conditional statements as close
+    // together as possible
+    static AstNode* optimize(AstNode* nodep, const StmtPropertiesAllocator& stmtProperties) {
+        CodeMotionOptimizeVisitor{nodep, stmtProperties};
+        // It is possible for the head of the list to be moved later such that it is no longer
+        // in head position. If so, rewind the list and return the new head.
+        while (nodep->backp()->nextp() == nodep) nodep = nodep->backp();
+        return nodep;
+    }
+};
+
+//######################################################################
+// Conditional merging
+
 class MergeCondVisitor final : public VNVisitor {
 private:
     // NODE STATE
-    // AstVar::user1        -> Flag set for variables referenced by m_mgCondp
-    // AstNode::user2       -> Flag marking node as included in merge because cheap to duplicate
-    const VNUser1InUse m_user1InUse;
-    const VNUser2InUse m_user2InUse;
+    // AstVar::user1        -> bool: Set for variables referenced by m_mgCondp
+    //                         (Only below MergeCondVisitor::process).
+    // AstNode::user2       -> bool: Marking node as included in merge because cheap to
+    //                         duplicate
+    //                         (Only below MergeCondVisitor::process).
+    // AstNodeStmt::user3   -> StmtProperties
+    //                         (Only below MergeCondVisitor::process).
+    // AstNode::user4       -> See CodeMotionAnalysisVisitor/CodeMotionOptimizeVisitor
+    // AstNode::user5       -> See CodeMotionAnalysisVisitor
 
     // STATE
     VDouble0 m_statMerges;  // Statistic tracking
@@ -128,24 +449,84 @@ private:
     const AstNode* m_mgNextp = nullptr;  // Next node in list being examined
     uint32_t m_listLenght = 0;  // Length of current list
 
-    CheckMergeableVisitor m_checkMergeable;  // Sub visitor for encapsulation & speed
+    std::queue<AstNode*>* m_workQueuep = nullptr;  // Node lists (via AstNode::nextp()) to merge
+    // Statement properties for code motion and merging
+    StmtPropertiesAllocator* m_stmtPropertiesp = nullptr;
 
     // METHODS
     VL_DEBUG_FUNC;  // Declare debug()
 
-    // This function extracts the Cond node from the RHS, if there is one and
-    // it is in a supported position, which are:
-    // - RHS is the Cond
-    // - RHS is And(Const, Cond). This And is inserted often by V3Clean.
-    static AstNodeCond* extractCond(AstNode* rhsp) {
-        if (AstNodeCond* const condp = VN_CAST(rhsp, NodeCond)) {
-            return condp;
-        } else if (const AstAnd* const andp = VN_CAST(rhsp, And)) {
-            if (AstNodeCond* const condp = VN_CAST(andp->rhsp(), NodeCond)) {
-                if (VN_IS(andp->lhsp(), Const)) return condp;
-            }
+    // Function that processes a whole sub-tree
+    void process(AstNode* nodep) {
+        // Set up work queue
+        std::queue<AstNode*> workQueue;
+        m_workQueuep = &workQueue;
+        m_workQueuep->push(nodep);
+
+        do {
+            // Set up user* for this iteration
+            const VNUser1InUse user1InUse;
+            const VNUser2InUse user2InUse;
+            const VNUser3InUse user3InUse;
+            // Statement properties only preserved for this iteration,
+            // then memory is released immediately.
+            StmtPropertiesAllocator stmtProperties;
+            m_stmtPropertiesp = &stmtProperties;
+
+            // Pop off current work item
+            AstNode* currp = m_workQueuep->front();
+            m_workQueuep->pop();
+
+            // Analyse sub-tree list for code motion
+            CodeMotionAnalysisVisitor::analyze(currp, stmtProperties);
+            // Perform the code motion within the whole sub-tree list
+            currp = CodeMotionOptimizeVisitor::optimize(currp, stmtProperties);
+
+            // Merge conditionals in the whole sub-tree list (this might create new work items)
+            iterateAndNextNull(currp);
+
+            // Close pending merge, if there is one at the end of the whole sub-tree list
+            if (m_mgFirstp) mergeEnd();
+        } while (!m_workQueuep->empty());
+    }
+
+    // Skip past AstArraySel and AstWordSel with const index
+    static AstNode* skipConstSels(AstNode* nodep) {
+        while (const AstArraySel* const aselp = VN_CAST(nodep, ArraySel)) {
+            // ArraySel index is not constant, so might be expensive
+            if (!VN_IS(aselp->bitp(), Const)) return nodep;
+            nodep = aselp->fromp();
         }
-        return nullptr;
+        while (const AstWordSel* const wselp = VN_CAST(nodep, WordSel)) {
+            // WordSel index is not constant, so might be expensive
+            if (!VN_IS(wselp->bitp(), Const)) return nodep;
+            nodep = wselp->fromp();
+        }
+        return nodep;
+    }
+
+    // Check if this node is cheap enough that duplicating it in two branches of an
+    // AstIf is not likely to cause a performance degradation.
+    static bool isCheapNode(AstNode* nodep) {
+        // Comments are cheap
+        if (VN_IS(nodep, Comment)) return true;
+        // So are some assignments
+        if (const AstNodeAssign* const assignp = VN_CAST(nodep, NodeAssign)) {
+            // Check LHS
+            AstNode* const lhsp = skipConstSels(assignp->lhsp());
+            // LHS is not a VarRef, so might be expensive
+            if (!VN_IS(lhsp, VarRef)) return false;
+
+            // Check RHS
+            AstNode* const rhsp = skipConstSels(assignp->rhsp());
+            // RHS is not a VarRef or Constant so might be expensive
+            if (!VN_IS(rhsp, VarRef) && !VN_IS(rhsp, Const)) return false;
+
+            // Otherwise it is a cheap assignment
+            return true;
+        }
+        // Others are not
+        return false;
     }
 
     // Predicate to check if an expression yields only 0 or 1 (i.e.: a 1-bit value)
@@ -196,23 +577,21 @@ private:
     static AstNode* maskLsb(AstNode* nodep) {
         if (yieldsOneOrZero(nodep)) return nodep;
         // Otherwise apply masking
-        AstNode* const maskp = new AstConst(nodep->fileline(), AstConst::BitTrue());
+        AstNode* const maskp = new AstConst{nodep->fileline(), AstConst::BitTrue()};
         // Mask on left, as conventional
-        return new AstAnd(nodep->fileline(), maskp, nodep);
+        return new AstAnd{nodep->fileline(), maskp, nodep};
     }
 
-    // Fold the RHS expression assuming the given condition state. Unlink bits
-    // from the RHS which is only used once, and can be reused. What remains
-    // of the RHS is expected to be deleted by the caller.
+    // Fold the RHS expression of an assignment assuming the given condition state.
+    // Unlink bits from the RHS which is only used once, and can be reused (is an unmodified
+    // sub-tree). What remains of the RHS is expected to be deleted by the caller.
     AstNode* foldAndUnlink(AstNode* rhsp, bool condTrue) {
         if (rhsp->sameTree(m_mgCondp)) {
-            return new AstConst(rhsp->fileline(), AstConst::BitTrue{}, condTrue);
-        } else if (const AstNodeCond* const condp = extractCond(rhsp)) {
+            return new AstConst{rhsp->fileline(), AstConst::BitTrue{}, condTrue};
+        } else if (const AstNodeCond* const condp = extractCondFromRhs(rhsp)) {
             AstNode* const resp
                 = condTrue ? condp->expr1p()->unlinkFrBack() : condp->expr2p()->unlinkFrBack();
-            if (condp == rhsp) {  //
-                return resp;
-            }
+            if (condp == rhsp) return resp;
             if (const AstAnd* const andp = VN_CAST(rhsp, And)) {
                 UASSERT_OBJ(andp->rhsp() == condp, rhsp, "Should not try to fold this");
                 return new AstAnd{andp->fileline(), andp->lhsp()->cloneTree(false), resp};
@@ -227,17 +606,18 @@ private:
                 return condTrue ? maskLsb(andp->lhsp()->unlinkFrBack())
                                 : new AstConst{rhsp->fileline(), AstConst::BitFalse()};
             }
-        } else if (VN_IS(rhsp, WordSel) || VN_IS(rhsp, VarRef) || VN_IS(rhsp, Const)) {
+        } else if (VN_IS(rhsp, ArraySel) || VN_IS(rhsp, WordSel) || VN_IS(rhsp, VarRef)
+                   || VN_IS(rhsp, Const)) {
             return rhsp->cloneTree(false);
         }
-        rhsp->dumpTree("Don't know how to fold expression: ");
-        rhsp->v3fatalSrc("Don't know how to fold expression");
+        // LCOV_EXCL_START
+        if (debug()) rhsp->dumpTree("Don't know how to fold expression: ");
+        rhsp->v3fatalSrc("Should not try to fold this during conditional merging");
+        // LCOV_EXCL_STOP
     }
 
-    void mergeEnd(int lineno) {
-        UASSERT(m_mgFirstp, "mergeEnd without list " << lineno);
-        // We might want to recursively merge an AstIf. We stash it in this variable.
-        const AstNodeIf* recursivep = nullptr;
+    void mergeEnd() {
+        UASSERT(m_mgFirstp, "mergeEnd without list");
         // Drop leading cheap nodes. These were only added in the hope of finding
         // an earlier reduced form, but we failed to do so.
         while (m_mgFirstp->user2() && m_mgFirstp != m_mgLastp) {
@@ -254,8 +634,11 @@ private:
             m_mgLastp = m_mgLastp->backp();
             --m_listLenght;
             UASSERT_OBJ(m_mgLastp && m_mgLastp->nextp() == nextp, m_mgFirstp,
-                        "Cheap assignment should not be at the front of the list");
+                        "Cheap statement should not be at the front of the list");
         }
+        // If the list contains a single AstNodeIf, we will want to merge its branches.
+        // If so, keep hold of the AstNodeIf in this variable.
+        AstNodeIf* recursivep = nullptr;
         // Merge if list is longer than one node
         if (m_mgFirstp != m_mgLastp) {
             UINFO(6, "MergeCond - First: " << m_mgFirstp << " Last: " << m_mgLastp << endl);
@@ -266,7 +649,7 @@ private:
             // and we also need to keep track of it for comparisons later.
             m_mgCondp = m_mgCondp->cloneTree(false);
             // Create equivalent 'if' statement and insert it before the first node
-            AstIf* const resultp = new AstIf(m_mgCondp->fileline(), m_mgCondp);
+            AstIf* const resultp = new AstIf{m_mgCondp->fileline(), m_mgCondp};
             m_mgFirstp->addHereThisAsNext(resultp);
             // Unzip the list and insert under branches
             AstNode* nextp = m_mgFirstp;
@@ -308,10 +691,12 @@ private:
                     VL_DO_DANGLING(ifp->deleteTree(), ifp);
                 }
             } while (nextp);
-            // Recursively merge the resulting AstIf
-            recursivep = resultp;
-        } else if (const AstNodeIf* const ifp = VN_CAST(m_mgFirstp, NodeIf)) {
-            // There was nothing to merge this AstNodeIf with, but try to merge it's branches
+            // Merge the branches of the resulting AstIf after re-analysis
+            if (resultp->ifsp()) m_workQueuep->push(resultp->ifsp());
+            if (resultp->elsesp()) m_workQueuep->push(resultp->elsesp());
+        } else if (AstNodeIf* const ifp = VN_CAST(m_mgFirstp, NodeIf)) {
+            // There was nothing to merge this AstNodeIf with, so try to merge its branches.
+            // No re-analysis is required for this, so do it directly below
             recursivep = ifp;
         }
         // Reset state
@@ -321,14 +706,13 @@ private:
         m_mgNextp = nullptr;
         AstNode::user1ClearTree();  // Clear marked variables
         AstNode::user2ClearTree();
-        // Merge recursively within the branches
+        // Merge recursively within the branches of an un-merged AstNodeIf
         if (recursivep) {
             iterateAndNextNull(recursivep->ifsp());
-            // Close list, if there is one at the end of the then branch
-            if (m_mgFirstp) mergeEnd(__LINE__);
             iterateAndNextNull(recursivep->elsesp());
-            // Close list, if there is one at the end of the else branch
-            if (m_mgFirstp) mergeEnd(__LINE__);
+            // Close a pending merge to ensure merge state is
+            // reset as expected at the end of this function
+            if (m_mgFirstp) mergeEnd();
         }
     }
 
@@ -351,47 +735,16 @@ private:
         return false;
     }
 
-    // Check if this node is cheap enough that duplicating it in two branches of an
-    // AstIf and is hence not likely to cause a performance degradation if doing so.
-    bool isCheapNode(AstNode* nodep) const {
-        if (VN_IS(nodep, Comment)) return true;
-        if (const AstNodeAssign* const assignp = VN_CAST(nodep, NodeAssign)) {
-            // Check LHS
-            AstNode* lhsp = assignp->lhsp();
-            while (AstWordSel* const wselp = VN_CAST(lhsp, WordSel)) {
-                // WordSel index is not constant, so might be expensive
-                if (!VN_IS(wselp->bitp(), Const)) return false;
-                lhsp = wselp->fromp();
-            }
-            // LHS is not a VarRef, so might be expensive
-            if (!VN_IS(lhsp, VarRef)) return false;
-
-            // Check RHS
-            AstNode* rhsp = assignp->rhsp();
-            while (AstWordSel* const wselp = VN_CAST(rhsp, WordSel)) {
-                // WordSel index is not constant, so might be expensive
-                if (!VN_IS(wselp->bitp(), Const)) return false;
-                rhsp = wselp->fromp();
-            }
-            // RHS is not a VarRef or Constant so might be expensive
-            if (!VN_IS(rhsp, VarRef) && !VN_IS(rhsp, Const)) return false;
-
-            // Otherwise it is a cheap assignment
-            return true;
-        }
-        return false;
-    }
-
-    bool addToList(AstNode* nodep, AstNode* condp, int line) {
+    bool addToList(AstNodeStmt* nodep, AstNode* condp) {
         // Set up head of new list if node is first in list
         if (!m_mgFirstp) {
-            UASSERT_OBJ(condp, nodep, "Cannot start new list without condition " << line);
+            UASSERT_OBJ(condp, nodep, "Cannot start new list without condition");
             // Mark variable references in the condition
             condp->foreach<AstVarRef>([](const AstVarRef* nodep) { nodep->varp()->user1(1); });
             // Now check again if mergeable. We need this to pick up assignments to conditions,
             // e.g.: 'c = c ? a : b' at the beginning of the list, which is in fact not mergeable
             // because it updates the condition. We simply bail on these.
-            if (m_checkMergeable(nodep) != Mergeable::YES) {
+            if ((*m_stmtPropertiesp)(nodep).writesConditionVar()) {
                 // Clear marked variables
                 AstNode::user1ClearTree();
                 // We did not add to the list
@@ -400,11 +753,13 @@ private:
             m_mgFirstp = nodep;
             m_mgCondp = condp;
             m_listLenght = 0;
-            // Add any preceding nodes to the list that would allow us to extend the merge range
-            for (;;) {
-                AstNode* const backp = m_mgFirstp->backp();
+            // Add any preceding nodes to the list that would allow us to extend the merge
+            // range
+            while (true) {
+                AstNodeStmt* const backp = VN_CAST(m_mgFirstp->backp(), NodeStmt);
                 if (!backp || backp->nextp() != m_mgFirstp) break;  // Don't move up the tree
-                if (m_checkMergeable(backp) != Mergeable::YES) break;
+                const StmtProperties& props = (*m_stmtPropertiesp)(backp);
+                if (props.m_isFence || props.writesConditionVar()) break;
                 if (isSimplifiableNode(backp)) {
                     ++m_listLenght;
                     m_mgFirstp = backp;
@@ -424,59 +779,53 @@ private:
         // Set up expected next node in list.
         m_mgNextp = nodep->nextp();
         // If last under parent, done with current list
-        if (!m_mgNextp) mergeEnd(__LINE__);
+        if (!m_mgNextp) mergeEnd();
         // We did add to the list
         return true;
     }
 
     // If this node is the next expected node and is helpful to add to the list, do so,
     // otherwise end the current merge. Return ture if added, false if ended merge.
-    bool addIfHelpfulElseEndMerge(AstNode* nodep) {
+    bool addIfHelpfulElseEndMerge(AstNodeStmt* nodep) {
         UASSERT_OBJ(m_mgFirstp, nodep, "List must be open");
         if (m_mgNextp == nodep) {
             if (isSimplifiableNode(nodep)) {
-                if (addToList(nodep, nullptr, __LINE__)) return true;
+                if (addToList(nodep, nullptr)) return true;
             } else if (isCheapNode(nodep)) {
                 nodep->user2(1);
-                if (addToList(nodep, nullptr, __LINE__)) return true;
+                if (addToList(nodep, nullptr)) return true;
             }
         }
         // Not added to list, so we are done with the current list
-        mergeEnd(__LINE__);
+        mergeEnd();
         return false;
     }
 
-    bool checkOrMakeMergeable(AstNode* nodep) {
-        const Mergeable reason = m_checkMergeable(nodep);
-        // If meregeable, we are done
-        if (reason == Mergeable::YES) return true;
-        // Node not mergeable.
-        // If no current list, then this node is just special, move on.
-        if (!m_mgFirstp) return false;
-        // Otherwise finish current list
-        mergeEnd(__LINE__);
-        // If a tree was not mergeable due to an assignment to a condition,
-        // then finishing the current list makes it mergeable again.
-        return reason == Mergeable::NO_COND_ASSIGN;
+    bool checkOrMakeMergeable(const AstNodeStmt* nodep) {
+        const StmtProperties& props = (*m_stmtPropertiesp)(nodep);
+        if (props.m_isFence) return false;  // Fence node never mergeable
+        // If the statement writes a condition variable of a pending merge,
+        // we must end the pending merge
+        if (m_mgFirstp && props.writesConditionVar()) mergeEnd();
+        return true;  // Now surely mergeable
     }
 
-    void mergeEndIfIncompatible(AstNode* nodep, AstNode* condp) {
+    void mergeEndIfIncompatible(const AstNode* nodep, const AstNode* condp) {
         if (m_mgFirstp && (m_mgNextp != nodep || !condp->sameTree(m_mgCondp))) {
             // Node in different list, or has different condition. Finish current list.
-            mergeEnd(__LINE__);
+            mergeEnd();
         }
     }
 
     // VISITORS
     virtual void visit(AstNodeAssign* nodep) override {
-        AstNode* const rhsp = nodep->rhsp();
-        if (const AstNodeCond* const condp = extractCond(rhsp)) {
+        if (AstNode* const condp = (*m_stmtPropertiesp)(nodep).m_condp) {
             // Check if mergeable
             if (!checkOrMakeMergeable(nodep)) return;
             // Close potentially incompatible pending merge
-            mergeEndIfIncompatible(nodep, condp->condp());
+            mergeEndIfIncompatible(nodep, condp);
             // Add current node
-            addToList(nodep, condp->condp(), __LINE__);
+            addToList(nodep, condp);
         } else if (m_mgFirstp) {
             addIfHelpfulElseEndMerge(nodep);
         }
@@ -493,21 +842,22 @@ private:
         // Close potentially incompatible pending merge
         mergeEndIfIncompatible(nodep, nodep->condp());
         // Add current node
-        addToList(nodep, nodep->condp(), __LINE__);
+        addToList(nodep, nodep->condp());
+    }
+
+    virtual void visit(AstNodeStmt* nodep) override {
+        if (m_mgFirstp && addIfHelpfulElseEndMerge(nodep)) return;
+        iterateChildren(nodep);
+    }
+
+    virtual void visit(AstCFunc* nodep) override {
+        // Merge function body
+        if (nodep->stmtsp()) process(nodep->stmtsp());
     }
 
     // For speed, only iterate what is necessary.
     virtual void visit(AstNetlist* nodep) override { iterateAndNextNull(nodep->modulesp()); }
     virtual void visit(AstNodeModule* nodep) override { iterateAndNextNull(nodep->stmtsp()); }
-    virtual void visit(AstCFunc* nodep) override {
-        iterateChildren(nodep);
-        // Close list, if there is one at the end of the function
-        if (m_mgFirstp) mergeEnd(__LINE__);
-    }
-    virtual void visit(AstNodeStmt* nodep) override {
-        if (m_mgFirstp && addIfHelpfulElseEndMerge(nodep)) return;
-        iterateChildren(nodep);
-    }
     virtual void visit(AstNode* nodep) override {}
 
 public:
@@ -520,6 +870,8 @@ public:
     }
 };
 
+}  // namespace
+
 //######################################################################
 // MergeConditionals class functions
 
diff --git a/test_regress/t/t_merge_cond.pl b/test_regress/t/t_merge_cond.pl
index 51f97242d..971a808af 100755
--- a/test_regress/t/t_merge_cond.pl
+++ b/test_regress/t/t_merge_cond.pl
@@ -21,11 +21,11 @@ execute(
 if ($Self->{vlt}) {
     # Note, with vltmt this might be split differently, so only checking vlt
     file_grep($Self->{stats}, qr/Optimizations, MergeCond merges\s+(\d+)/i,
-              10);
+              9);
     file_grep($Self->{stats}, qr/Optimizations, MergeCond merged items\s+(\d+)/i,
               580);
     file_grep($Self->{stats}, qr/Optimizations, MergeCond longest merge\s+(\d+)/i,
-              64);
+              128);
 }
 
 ok(1);
diff --git a/test_regress/t/t_merge_cond_blowup.pl b/test_regress/t/t_merge_cond_blowup.pl
new file mode 100755
index 000000000..aa9e8e1fe
--- /dev/null
+++ b/test_regress/t/t_merge_cond_blowup.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/env perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2022 by Geza Lore. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+scenarios(vlt => 1);
+
+# TODO: This takes excessively long on vltmt, this should be fixed
+
+compile(
+    verilator_flags2 => ["--unroll-count 1000000000", "--output-split 0", "--stats"],
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+if ($Self->{vlt}) {
+    # Note, with vltmt this might be split differently, so only checking vlt
+    file_grep($Self->{stats}, qr/Optimizations, MergeCond merges\s+(\d+)/i,
+              500);   # V3MergeCond.cpp MAX_DISTANCE
+    file_grep($Self->{stats}, qr/Optimizations, MergeCond merged items\s+(\d+)/i,
+              1000);  # V3MergeCond.cpp MAX_DISTANCE *2
+    file_grep($Self->{stats}, qr/Optimizations, MergeCond longest merge\s+(\d+)/i,
+              2);
+}
+
+ok(1);
+1;
diff --git a/test_regress/t/t_merge_cond_blowup.v b/test_regress/t/t_merge_cond_blowup.v
new file mode 100644
index 000000000..aa97f8f26
--- /dev/null
+++ b/test_regress/t/t_merge_cond_blowup.v
@@ -0,0 +1,55 @@
+// DESCRIPTION: Verilator: Verilog Test module
+//
+// This file ONLY is placed under the Creative Commons Public Domain, for
+// any use, without warranty, 2022 by Geza Lore.
+// SPDX-License-Identifier: CC0-1.0
+
+module t (/*AUTOARG*/
+   // Inputs
+   clk
+   );
+   input clk;
+
+   localparam int N = 4096;
+
+   integer cyc = 0;
+   reg [63:0] crc= 64'h5aef0c8d_d70a4497;
+
+   always @ (posedge clk) begin
+      cyc <= cyc + 1;
+      crc <= {crc[62:0], crc[63] ^ crc[2] ^ crc[0]};
+
+      if (cyc==99) begin
+         $write("*-* All Finished *-*\n");
+         $finish;
+      end
+   end
+
+   reg a [N-1:0];
+   reg b [N-1:0];
+
+   // This yields pathological complexity for the current conditional merging
+   // algorithm. Note in practice, other parts of the compiler blow up on this
+   // code far earlier than the conditional merging, but here we go anyway.
+   generate
+      genvar i;
+      for (i = 0 ; i < N ; i = i + 1) begin
+        always @(posedge clk) a[i] <= (crc + 64'(i)) == 0 ? crc[(i+16)%64] : crc[(i+32)%64];
+      end
+      for (i = 0 ; i < N ; i = i + 1) begin
+        always @(posedge clk) b[i] <= (crc + 64'(i)) == 0 ? crc[(i+16)%64] : crc[(i+32)%64];
+      end
+   endgenerate
+
+   always @(posedge clk) begin
+      if (cyc >= 2) begin
+        for (int i = 0 ; i < N ; i = i + 1) begin
+          if (a[i] !== b[i]) begin
+            $write("%%Error: %s:%0d: cyc=%0d i=%0d a[i]='h%x b[i]='h%x\n", `__FILE__,`__LINE__, cyc, i, a[i], b[i]);
+            $stop;
+          end
+        end
+      end
+   end
+
+endmodule

From d45caca011b70e3468289254ccbb000604e8e603 Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Sat, 28 May 2022 12:07:24 +0100
Subject: [PATCH 02/19] Remove legacy VCD tracing API

This has not been used by Verilator for a while, but was kept for
compatibility with some external code. Now removed.
---
 include/verilated_trace_imp.cpp           |  37 ---
 include/verilated_vcd_c.cpp               | 279 ----------------------
 include/verilated_vcd_c.h                 | 160 -------------
 test_regress/t/t_trace_c_api.cpp          |  24 --
 test_regress/t/t_trace_c_api.pl           |  30 ---
 test_regress/t/t_trace_c_api.v            |   8 -
 test_regress/t/t_trace_complex_old_api.pl |  39 ---
 7 files changed, 577 deletions(-)
 delete mode 100644 test_regress/t/t_trace_c_api.cpp
 delete mode 100755 test_regress/t/t_trace_c_api.pl
 delete mode 100644 test_regress/t/t_trace_c_api.v
 delete mode 100755 test_regress/t/t_trace_complex_old_api.pl

diff --git a/include/verilated_trace_imp.cpp b/include/verilated_trace_imp.cpp
index 7a98b7abf..dac31ddac 100644
--- a/include/verilated_trace_imp.cpp
+++ b/include/verilated_trace_imp.cpp
@@ -725,41 +725,4 @@ static inline void cvtQDataToStr(char* dstp, QData value) {
 
 #define cvtEDataToStr cvtIDataToStr
 
-//=============================================================================
-
-#ifdef VERILATED_VCD_TEST
-
-void verilated_trace_imp_selftest() {
-#define SELF_CHECK(got, exp) \
-    do { \
-        if ((got) != (exp)) VL_FATAL_MT(__FILE__, __LINE__, "", "%Error: selftest"); \
-    } while (0)
-
-#define SELF_CHECK_TS(scale) \
-    SELF_CHECK(doubleToTimescale(timescaleToDouble(scale)), std::string{scale});
-    SELF_CHECK_TS("100s");
-    SELF_CHECK_TS("10s");
-    SELF_CHECK_TS("1s");
-    SELF_CHECK_TS("100ms");
-    SELF_CHECK_TS("10ms");
-    SELF_CHECK_TS("1ms");
-    SELF_CHECK_TS("100us");
-    SELF_CHECK_TS("10us");
-    SELF_CHECK_TS("1us");
-    SELF_CHECK_TS("100ns");
-    SELF_CHECK_TS("10ns");
-    SELF_CHECK_TS("1ns");
-    SELF_CHECK_TS("100ps");
-    SELF_CHECK_TS("10ps");
-    SELF_CHECK_TS("1ps");
-    SELF_CHECK_TS("100fs");
-    SELF_CHECK_TS("10fs");
-    SELF_CHECK_TS("1fs");
-    SELF_CHECK_TS("100as");
-    SELF_CHECK_TS("10as");
-    SELF_CHECK_TS("1as");
-}
-
-#endif
-
 #endif  // VL_CPPCHECK
diff --git a/include/verilated_vcd_c.cpp b/include/verilated_vcd_c.cpp
index 78383befc..d2417bc3e 100644
--- a/include/verilated_vcd_c.cpp
+++ b/include/verilated_vcd_c.cpp
@@ -562,23 +562,6 @@ void VerilatedVcd::declArray(uint32_t code, const char* name, bool array, int ar
 void VerilatedVcd::declDouble(uint32_t code, const char* name, bool array, int arraynum) {
     declare(code, name, "real", array, arraynum, false, false, 63, 0);
 }
-#ifdef VL_TRACE_VCD_OLD_API
-void VerilatedVcd::declTriBit(uint32_t code, const char* name, bool array, int arraynum) {
-    declare(code, name, "wire", array, arraynum, true, false, 0, 0);
-}
-void VerilatedVcd::declTriBus(uint32_t code, const char* name, bool array, int arraynum, int msb,
-                              int lsb) {
-    declare(code, name, "wire", array, arraynum, true, true, msb, lsb);
-}
-void VerilatedVcd::declTriQuad(uint32_t code, const char* name, bool array, int arraynum, int msb,
-                               int lsb) {
-    declare(code, name, "wire", array, arraynum, true, true, msb, lsb);
-}
-void VerilatedVcd::declTriArray(uint32_t code, const char* name, bool array, int arraynum, int msb,
-                                int lsb) {
-    declare(code, name, "wire", array, arraynum, true, true, msb, lsb);
-}
-#endif  //  VL_TRACE_VCD_OLD_API
 
 //=============================================================================
 // Trace rendering prinitives
@@ -689,265 +672,3 @@ void VerilatedVcd::emitDouble(uint32_t code, double newval) {
     wp += std::strlen(wp);
     finishLine(code, wp);
 }
-
-#ifdef VL_TRACE_VCD_OLD_API
-
-void VerilatedVcd::fullBit(uint32_t code, const uint32_t newval) {
-    // Note the &1, so we don't require clean input -- makes more common no change case faster
-    *oldp(code) = newval;
-    *m_writep++ = ('0' + static_cast<char>(newval & 1));
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullBus(uint32_t code, const uint32_t newval, int bits) {
-    *oldp(code) = newval;
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = ((newval & (1L << bit)) ? '1' : '0');
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullQuad(uint32_t code, const uint64_t newval, int bits) {
-    (*(reinterpret_cast<uint64_t*>(oldp(code)))) = newval;
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = ((newval & (1ULL << bit)) ? '1' : '0');
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullArray(uint32_t code, const uint32_t* newval, int bits) {
-    for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) { oldp(code)[word] = newval[word]; }
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = ((newval[(bit / 32)] & (1L << (bit & 0x1f))) ? '1' : '0');
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullArray(uint32_t code, const uint64_t* newval, int bits) {
-    for (int word = 0; word < (((bits - 1) / 64) + 1); ++word) { oldp(code)[word] = newval[word]; }
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = ((newval[(bit / 64)] & (1ULL << (bit & 0x3f))) ? '1' : '0');
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullTriBit(uint32_t code, const uint32_t newval, const uint32_t newtri) {
-    oldp(code)[0] = newval;
-    oldp(code)[1] = newtri;
-    *m_writep++ = "01zz"[newval | (newtri << 1)];
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullTriBus(uint32_t code, const uint32_t newval, const uint32_t newtri,
-                              int bits) {
-    oldp(code)[0] = newval;
-    oldp(code)[1] = newtri;
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = "01zz"[((newval >> bit) & 1) | (((newtri >> bit) & 1) << 1)];
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullTriQuad(uint32_t code, const uint64_t newval, const uint64_t newtri,
-                               int bits) {
-    (*(reinterpret_cast<uint64_t*>(oldp(code)))) = newval;
-    (*(reinterpret_cast<uint64_t*>(oldp(code + 1)))) = newtri;
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = "01zz"[((newval >> bit) & 1ULL) | (((newtri >> bit) & 1ULL) << 1ULL)];
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullTriArray(uint32_t code, const uint32_t* newvalp, const uint32_t* newtrip,
-                                int bits) {
-    for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) {
-        oldp(code)[word * 2] = newvalp[word];
-        oldp(code)[word * 2 + 1] = newtrip[word];
-    }
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        uint32_t valbit = (newvalp[(bit / 32)] >> (bit & 0x1f)) & 1;
-        uint32_t tribit = (newtrip[(bit / 32)] >> (bit & 0x1f)) & 1;
-        *m_writep++ = "01zz"[valbit | (tribit << 1)];
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullDouble(uint32_t code, const double newval) {
-    // cppcheck-suppress invalidPointerCast
-    (*(reinterpret_cast<double*>(oldp(code)))) = newval;
-    // Buffer can't overflow before VL_SNPRINTF; we sized during declaration
-    VL_SNPRINTF(m_writep, m_wrChunkSize, "r%.16g", newval);
-    m_writep += std::strlen(m_writep);
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-
-#endif  // VL_TRACE_VCD_OLD_API
-
-//======================================================================
-//======================================================================
-//======================================================================
-
-#ifdef VERILATED_VCD_TEST
-#include <iostream>
-
-extern void verilated_trace_imp_selftest();
-
-uint32_t v1, v2, s1, s2[3];
-uint32_t tri96[3];
-uint32_t tri96__tri[3];
-uint64_t quad96[2];
-uint64_t tquad;
-uint64_t tquad__tri;
-uint8_t ch;
-uint64_t timestamp = 1;
-double doub = 0.0;
-float flo = 0.0f;
-
-void vcdInit(void*, VerilatedVcd* vcdp, uint32_t) {
-    vcdp->scopeEscape('.');
-    vcdp->pushNamePrefix("top.");
-    /**/ vcdp->declBus(0x2, "v1", -1, 0, 5, 1);
-    /**/ vcdp->declBus(0x3, "v2", -1, 0, 6, 1);
-    /**/ vcdp->pushNamePrefix("sub1.");
-    /***/ vcdp->declBit(0x4, "s1", -1, 0);
-    /***/ vcdp->declBit(0x5, "ch", -1, 0);
-    /**/ vcdp->popNamePrefix();
-    /**/ vcdp->pushNamePrefix("sub2.");
-    /***/ vcdp->declArray(0x6, "s2", -1, 0, 40, 3);
-    /**/ vcdp->popNamePrefix();
-    vcdp->popNamePrefix();
-    // Note need to add 3 for next code.
-    vcdp->pushNamePrefix("top2.");
-    /**/ vcdp->declBus(0x2, "t2v1", -1, 0, 4, 1);
-    /**/ vcdp->declTriBit(0x10, "io1", -1, 0);
-    /**/ vcdp->declTriBus(0x12, "io5", -1, 0, 4, 0);
-    /**/ vcdp->declTriArray(0x16, "io96", -1, 0, 95, 0);
-    /**/  // Note need to add 6 for next code.
-    /**/ vcdp->declDouble(0x1c, "doub", -1, 0);
-    /**/  // Note need to add 2 for next code.
-    /**/ vcdp->declArray(0x20, "q2", -1, 0, 95, 0);
-    /**/  // Note need to add 4 for next code.
-    /**/ vcdp->declTriQuad(0x24, "tq", -1, 0, 63, 0);
-    /**/  // Note need to add 4 for next code.
-    vcdp->popNamePrefix();
-}
-
-void vcdFull(void*, VerilatedVcd* vcdp) {
-    vcdp->fullBus(0x2, v1, 5);
-    vcdp->fullBus(0x3, v2, 7);
-    vcdp->fullBit(0x4, s1);
-    vcdp->fullBus(0x5, ch, 2);
-    vcdp->fullArray(0x6, &s2[0], 38);
-    vcdp->fullTriBit(0x10, tri96[0] & 1, tri96__tri[0] & 1);
-    vcdp->fullTriBus(0x12, tri96[0] & 0x1f, tri96__tri[0] & 0x1f, 5);
-    vcdp->fullTriArray(0x16, tri96, tri96__tri, 96);
-    vcdp->fullDouble(0x1c, doub);
-    vcdp->fullArray(0x20, &quad96[0], 96);
-    vcdp->fullTriQuad(0x24, tquad, tquad__tri, 64);
-}
-
-void vcdChange(void*, VerilatedVcd* vcdp) {
-    vcdp->chgBus(0x2, v1, 5);
-    vcdp->chgBus(0x3, v2, 7);
-    vcdp->chgBit(0x4, s1);
-    vcdp->chgBus(0x5, ch, 2);
-    vcdp->chgArray(0x6, &s2[0], 38);
-    vcdp->chgTriBit(0x10, tri96[0] & 1, tri96__tri[0] & 1);
-    vcdp->chgTriBus(0x12, tri96[0] & 0x1f, tri96__tri[0] & 0x1f, 5);
-    vcdp->chgTriArray(0x16, tri96, tri96__tri, 96);
-    vcdp->chgDouble(0x1c, doub);
-    vcdp->chgArray(0x20, &quad96[0], 96);
-    vcdp->chgTriQuad(0x24, tquad, tquad__tri, 64);
-}
-
-// clang-format off
-void vcdTestMain(const char* filenamep) {
-    verilated_trace_imp_selftest();
-
-    v1 = v2 = s1 = 0;
-    s2[0] = s2[1] = s2[2] = 0;
-    tri96[2] = tri96[1] = tri96[0] = 0;
-    tri96__tri[2] = tri96__tri[1] = tri96__tri[0] = ~0;
-    quad96[1] = quad96[0] = 0;
-    ch = 0;
-    doub = 0;
-    tquad = tquad__tri = 0;
-    {
-        VerilatedVcdC* vcdp = new VerilatedVcdC;
-        vcdp->evcd(true);
-        vcdp->set_time_unit("1ms");
-        vcdp->set_time_unit(std::string{"1ms"});
-        vcdp->set_time_resolution("1ns");
-        vcdp->set_time_resolution(std::string{"1ns"});
-        vcdp->spTrace()->addInitCb(&vcdInit, 0);
-        vcdp->spTrace()->addFullCb(&vcdFull, 0);
-        vcdp->spTrace()->addChgCb(&vcdChange, 0);
-        vcdp->open(filenamep);
-        // Dumping
-        vcdp->dump(++timestamp);
-        v1 = 0xfff;
-        tri96[2] = 4; tri96[1] = 2; tri96[0] = 1;
-        tri96__tri[2] = tri96__tri[1] = tri96__tri[0] = ~0;  // Still tri
-        quad96[1] = 0xffffffff; quad96[0] = 0;
-        doub = 1.5;
-        flo = 1.4f;
-        vcdp->dump(++timestamp);
-        v2 = 0x1;
-        s2[1] = 2;
-        tri96__tri[2] = tri96__tri[1] = tri96__tri[0] = 0;  // enable w/o data change
-        quad96[1] = 0; quad96[0] = ~0;
-        doub = -1.66e13;
-        flo = 0.123f;
-        tquad = 0x00ff00ff00ff00ffULL;
-        tquad__tri = 0x0000fffff0000ffffULL;
-        vcdp->dump(++timestamp);
-        ch = 2;
-        tri96[2] = ~4; tri96[1] = ~2; tri96[0] = ~1;
-        doub = -3.33e-13;
-        vcdp->dump(++timestamp);
-        vcdp->dump(++timestamp);
-# ifdef VERILATED_VCD_TEST_64BIT
-        const uint64_t bytesPerDump = 15ULL;
-        for (uint64_t i = 0; i < ((1ULL << 32) / bytesPerDump); i++) {
-            v1 = i;
-            vcdp->dump(++timestamp);
-        }
-# endif
-        vcdp->close();
-        VL_DO_CLEAR(delete vcdp, vcdp = nullptr);
-    }
-}
-#endif
-// clang-format on
-
-//********************************************************************
-// ;compile-command: "v4make test_regress/t/t_trace_c_api.pl"
-//
-// Local Variables:
-// End:
diff --git a/include/verilated_vcd_c.h b/include/verilated_vcd_c.h
index 5fbb6022c..b1485e13b 100644
--- a/include/verilated_vcd_c.h
+++ b/include/verilated_vcd_c.h
@@ -164,156 +164,6 @@ public:
     void declQuad(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
     void declArray(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
     void declDouble(uint32_t code, const char* name, bool array, int arraynum);
-
-#ifdef VL_TRACE_VCD_OLD_API
-    //=========================================================================
-    // Note: These are only for testing for backward compatibility with foreign
-    // code and is not used by Verilator. Do not use these as there is no
-    // guarantee of functionality.
-
-    void declTriBit(uint32_t code, const char* name, bool array, int arraynum);
-    void declTriBus(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
-    void declTriQuad(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
-    void declTriArray(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
-
-    void fullBit(uint32_t* oldp, CData newval) { fullBit(oldp - this->oldp(0), newval); }
-    void fullCData(uint32_t* oldp, CData newval, int bits) {
-        fullBus(oldp - this->oldp(0), newval, bits);
-    }
-    void fullSData(uint32_t* oldp, SData newval, int bits) {
-        fullBus(oldp - this->oldp(0), newval, bits);
-    }
-    void fullIData(uint32_t* oldp, IData newval, int bits) {
-        fullBus(oldp - this->oldp(0), newval, bits);
-    }
-    void fullQData(uint32_t* oldp, QData newval, int bits) {
-        fullQuad(oldp - this->oldp(0), newval, bits);
-    }
-    void fullWData(uint32_t* oldp, const WData* newvalp, int bits) {
-        fullArray(oldp - this->oldp(0), newvalp, bits);
-    }
-    void fullDouble(uint32_t* oldp, double newval) { fullDouble(oldp - this->oldp(0), newval); }
-
-    inline void chgBit(uint32_t* oldp, CData newval) { chgBit(oldp - this->oldp(0), newval); }
-    inline void chgCData(uint32_t* oldp, CData newval, int bits) {
-        chgBus(oldp - this->oldp(0), newval, bits);
-    }
-    inline void chgSData(uint32_t* oldp, SData newval, int bits) {
-        chgBus(oldp - this->oldp(0), newval, bits);
-    }
-    inline void chgIData(uint32_t* oldp, IData newval, int bits) {
-        chgBus(oldp - this->oldp(0), newval, bits);
-    }
-    inline void chgQData(uint32_t* oldp, QData newval, int bits) {
-        chgQuad(oldp - this->oldp(0), newval, bits);
-    }
-    inline void chgWData(uint32_t* oldp, const WData* newvalp, int bits) {
-        chgArray(oldp - this->oldp(0), newvalp, bits);
-    }
-    inline void chgDouble(uint32_t* oldp, double newval) {
-        chgDouble(oldp - this->oldp(0), newval);
-    }
-
-    // Inside dumping routines, dump one signal, faster when not inlined
-    // due to code size reduction.
-    void fullBit(uint32_t code, const uint32_t newval);
-    void fullBus(uint32_t code, const uint32_t newval, int bits);
-    void fullQuad(uint32_t code, const uint64_t newval, int bits);
-    void fullArray(uint32_t code, const uint32_t* newvalp, int bits);
-    void fullArray(uint32_t code, const uint64_t* newvalp, int bits);
-    void fullTriBit(uint32_t code, const uint32_t newval, const uint32_t newtri);
-    void fullTriBus(uint32_t code, const uint32_t newval, const uint32_t newtri, int bits);
-    void fullTriQuad(uint32_t code, const uint64_t newval, const uint64_t newtri, int bits);
-    void fullTriArray(uint32_t code, const uint32_t* newvalp, const uint32_t* newtrip, int bits);
-    void fullDouble(uint32_t code, const double newval);
-
-    // Inside dumping routines, dump one signal if it has changed.
-    // We do want to inline these to avoid calls when the value did not change.
-    inline void chgBit(uint32_t code, const uint32_t newval) {
-        const uint32_t diff = oldp(code)[0] ^ newval;
-        if (VL_UNLIKELY(diff)) fullBit(code, newval);
-    }
-    inline void chgBus(uint32_t code, const uint32_t newval, int bits) {
-        const uint32_t diff = oldp(code)[0] ^ newval;
-        if (VL_UNLIKELY(diff)) {
-            if (VL_UNLIKELY(bits == 32 || (diff & ((1U << bits) - 1)))) {
-                fullBus(code, newval, bits);
-            }
-        }
-    }
-    inline void chgQuad(uint32_t code, const uint64_t newval, int bits) {
-        const uint64_t diff = (*(reinterpret_cast<uint64_t*>(oldp(code)))) ^ newval;
-        if (VL_UNLIKELY(diff)) {
-            if (VL_UNLIKELY(bits == 64 || (diff & ((1ULL << bits) - 1)))) {
-                fullQuad(code, newval, bits);
-            }
-        }
-    }
-    inline void chgArray(uint32_t code, const uint32_t* newvalp, int bits) {
-        for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) {
-            if (VL_UNLIKELY(oldp(code)[word] ^ newvalp[word])) {
-                fullArray(code, newvalp, bits);
-                return;
-            }
-        }
-    }
-    inline void chgArray(uint32_t code, const uint64_t* newvalp, int bits) {
-        for (int word = 0; word < (((bits - 1) / 64) + 1); ++word) {
-            if (VL_UNLIKELY(*(reinterpret_cast<uint64_t*>(oldp(code + 2 * word)))
-                            ^ newvalp[word])) {
-                fullArray(code, newvalp, bits);
-                return;
-            }
-        }
-    }
-    inline void chgTriBit(uint32_t code, const uint32_t newval, const uint32_t newtri) {
-        const uint32_t diff = ((oldp(code)[0] ^ newval) | (oldp(code)[1] ^ newtri));
-        if (VL_UNLIKELY(diff)) {
-            // Verilator 3.510 and newer provide clean input, so the below
-            // is only for back compatibility
-            if (VL_UNLIKELY(diff & 1)) {  // Change after clean?
-                fullTriBit(code, newval, newtri);
-            }
-        }
-    }
-    inline void chgTriBus(uint32_t code, const uint32_t newval, const uint32_t newtri, int bits) {
-        const uint32_t diff = ((oldp(code)[0] ^ newval) | (oldp(code)[1] ^ newtri));
-        if (VL_UNLIKELY(diff)) {
-            if (VL_UNLIKELY(bits == 32 || (diff & ((1U << bits) - 1)))) {
-                fullTriBus(code, newval, newtri, bits);
-            }
-        }
-    }
-    inline void chgTriQuad(uint32_t code, const uint64_t newval, const uint64_t newtri, int bits) {
-        const uint64_t diff = (((*(reinterpret_cast<uint64_t*>(oldp(code)))) ^ newval)
-                               | ((*(reinterpret_cast<uint64_t*>(oldp(code + 1)))) ^ newtri));
-        if (VL_UNLIKELY(diff)) {
-            if (VL_UNLIKELY(bits == 64 || (diff & ((1ULL << bits) - 1)))) {
-                fullTriQuad(code, newval, newtri, bits);
-            }
-        }
-    }
-    inline void chgTriArray(uint32_t code, const uint32_t* newvalp, const uint32_t* newtrip,
-                            int bits) {
-        for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) {
-            if (VL_UNLIKELY((oldp(code)[word * 2] ^ newvalp[word])
-                            | (oldp(code)[word * 2 + 1] ^ newtrip[word]))) {
-                fullTriArray(code, newvalp, newtrip, bits);
-                return;
-            }
-        }
-    }
-    inline void chgDouble(uint32_t code, const double newval) {
-        // cppcheck-suppress invalidPointerCast
-        if (VL_UNLIKELY((*(reinterpret_cast<double*>(oldp(code)))) != newval)) {
-            fullDouble(code, newval);
-        }
-    }
-
-    // METHODS
-    // Old/standalone API only
-    void evcd(bool flag) { m_evcd = flag; }
-#endif  // VL_TRACE_VCD_OLD_API
 };
 
 #ifndef DOXYGEN
@@ -396,16 +246,6 @@ public:
 
     // Internal class access
     inline VerilatedVcd* spTrace() { return &m_sptrace; }
-
-#ifdef VL_TRACE_VCD_OLD_API
-    //=========================================================================
-    // Note: These are only for testing for backward compatibility with foreign
-    // code and is not used by Verilator. Do not use these as there is no
-    // guarantee of functionality.
-
-    // Use evcd format
-    void evcd(bool flag) VL_MT_UNSAFE_ONE { m_sptrace.evcd(flag); }
-#endif
 };
 
 #endif  // guard
diff --git a/test_regress/t/t_trace_c_api.cpp b/test_regress/t/t_trace_c_api.cpp
deleted file mode 100644
index d2d3f0921..000000000
--- a/test_regress/t/t_trace_c_api.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// -*- mode: C++; c-file-style: "cc-mode" -*-
-//
-// DESCRIPTION: Verilator: Verilog Test module
-//
-// This file ONLY is placed under the Creative Commons Public Domain, for
-// any use, without warranty, 2008 by Wilson Snyder.
-// SPDX-License-Identifier: CC0-1.0
-
-#include <verilated.h>
-#include <verilated_vcd_c.h>
-
-#include VM_PREFIX_INCLUDE
-
-double sc_time_stamp() { return 0; }
-
-extern void vcdTestMain(const char* filenamep);
-
-int main(int argc, char** argv, char** env) {
-    const char* filenamep = VL_STRINGIFY(TEST_OBJ_DIR) "/simx.vcd";
-    printf("Writing %s\n", filenamep);
-    vcdTestMain(filenamep);
-    printf("*-* All Finished *-*\n");
-    return 0;
-}
diff --git a/test_regress/t/t_trace_c_api.pl b/test_regress/t/t_trace_c_api.pl
deleted file mode 100755
index 541970008..000000000
--- a/test_regress/t/t_trace_c_api.pl
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env perl
-if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
-# DESCRIPTION: Verilator: Verilog Test driver/expect definition
-#
-# Copyright 2003-2013 by Wilson Snyder. This program is free software; you
-# can redistribute it and/or modify it under the terms of either the GNU
-# Lesser General Public License Version 3 or the Perl Artistic License
-# Version 2.0.
-# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
-
-scenarios(vlt => 1);
-
-compile(
-    make_top_shell => 0,
-    make_main => 0,
-    v_flags2 => ["--trace --exe $Self->{t_dir}/t_trace_c_api.cpp",
-                 "-CFLAGS -DVERILATED_VCD_TEST",
-                 "-CFLAGS -DVL_TRACE_VCD_OLD_API"],
-    );
-
-execute(
-    check_finished => 1,
-    );
-
-# vcddiff bug crashes
-#vcd_identical("$Self->{obj_dir}/simx.vcd",
-#              $Self->{golden_filename});
-
-ok(1);
-1;
diff --git a/test_regress/t/t_trace_c_api.v b/test_regress/t/t_trace_c_api.v
deleted file mode 100644
index 7b440cb91..000000000
--- a/test_regress/t/t_trace_c_api.v
+++ /dev/null
@@ -1,8 +0,0 @@
-// DESCRIPTION: Verilator: Verilog Test module
-//
-// This file ONLY is placed under the Creative Commons Public Domain, for
-// any use, without warranty, 2013 by Wilson Snyder.
-// SPDX-License-Identifier: CC0-1.0
-
-module t;
-endmodule
diff --git a/test_regress/t/t_trace_complex_old_api.pl b/test_regress/t/t_trace_complex_old_api.pl
deleted file mode 100755
index 8136d3f79..000000000
--- a/test_regress/t/t_trace_complex_old_api.pl
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env perl
-if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
-# DESCRIPTION: Verilator: Verilog Test driver/expect definition
-#
-# Copyright 2003-2009 by Wilson Snyder. This program is free software; you
-# can redistribute it and/or modify it under the terms of either the GNU
-# Lesser General Public License Version 3 or the Perl Artistic License
-# Version 2.0.
-# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
-
-# Same test as t_trace_complex, but exercising the old VCD tracing API
-
-scenarios(vlt => 1);
-
-top_filename("t/t_trace_complex.v");
-golden_filename("t/t_trace_complex.out");
-
-compile(
-    verilator_flags2 => ['--cc --trace -CFLAGS -DVL_TRACE_VCD_OLD_API'],
-    );
-
-execute(
-    check_finished => 1,
-    );
-
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_strp /);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_strp_strp /);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arrp /);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_arrp /);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_strp /);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arru\[/);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arru\[/);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arrp\[/);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arru_strp\[/);
-
-vcd_identical("$Self->{obj_dir}/simx.vcd", $Self->{golden_filename});
-
-ok(1);
-1;

From cf1eccc24f3d5f2224a37ec18a2590b2868fdc79 Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Sat, 28 May 2022 12:17:39 +0100
Subject: [PATCH 03/19] Make local function 'static' in verilated_profiler.cpp

---
 include/verilated_profiler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/verilated_profiler.cpp b/include/verilated_profiler.cpp
index 1a5f16a36..ed25093d1 100644
--- a/include/verilated_profiler.cpp
+++ b/include/verilated_profiler.cpp
@@ -60,7 +60,7 @@ uint16_t VlExecutionRecord::getcpu() {
 //=============================================================================
 // VlExecutionProfiler implementation
 
-template <size_t N> size_t roundUptoMultipleOf(size_t value) {
+template <size_t N> static size_t roundUptoMultipleOf(size_t value) {
     static_assert((N & (N - 1)) == 0, "'N' must be a power of 2");
     size_t mask = N - 1;
     return (value + mask) & ~mask;

From a48c779367417115e194adbdd3acf7818beeb0f5 Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Sat, 28 May 2022 12:20:35 +0100
Subject: [PATCH 04/19] Rename verilated_trace_imp.cpp -> verilated_trace_imp.h

Also fix file header to describe purpose of this file.
---
 include/verilated_fst_c.cpp                        |  4 ++--
 ...rilated_trace_imp.cpp => verilated_trace_imp.h} | 14 +++++---------
 include/verilated_vcd_c.cpp                        |  4 ++--
 3 files changed, 9 insertions(+), 13 deletions(-)
 rename include/{verilated_trace_imp.cpp => verilated_trace_imp.h} (99%)

diff --git a/include/verilated_fst_c.cpp b/include/verilated_fst_c.cpp
index 68431db71..3e1b27744 100644
--- a/include/verilated_fst_c.cpp
+++ b/include/verilated_fst_c.cpp
@@ -84,7 +84,7 @@ static_assert(static_cast<int>(FST_ST_VCD_PROGRAM) == static_cast<int>(VLT_TRACE
 // Specialization of the generics for this trace format
 
 #define VL_DERIVED_T VerilatedFst
-#include "verilated_trace_imp.cpp"
+#include "verilated_trace_imp.h"
 #undef VL_DERIVED_T
 
 //=============================================================================
@@ -246,7 +246,7 @@ void VerilatedFst::declDouble(uint32_t code, const char* name, int dtypenum, fst
 }
 
 // Note: emit* are only ever called from one place (full* in
-// verilated_trace_imp.cpp, which is included in this file at the top),
+// verilated_trace_imp.h, which is included in this file at the top),
 // so always inline them.
 
 VL_ATTR_ALWINLINE
diff --git a/include/verilated_trace_imp.cpp b/include/verilated_trace_imp.h
similarity index 99%
rename from include/verilated_trace_imp.cpp
rename to include/verilated_trace_imp.h
index dac31ddac..ed0503fcb 100644
--- a/include/verilated_trace_imp.cpp
+++ b/include/verilated_trace_imp.h
@@ -10,15 +10,11 @@
 // SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
 //
 //=============================================================================
-///
-/// \file
-/// \brief Verilated common-format tracing implementation code
-///
-/// This file must be compiled and linked against all Verilated objects
-/// that use --trace.
-///
-/// Use "verilator --trace" to add this to the Makefile for the linker.
-///
+//
+// Verilated tracing implementation code template common to all formats.
+// This file is included by the format specific implementations and
+// should not be used otherwise.
+//
 //=============================================================================
 
 // clang-format off
diff --git a/include/verilated_vcd_c.cpp b/include/verilated_vcd_c.cpp
index d2417bc3e..8e0008e3f 100644
--- a/include/verilated_vcd_c.cpp
+++ b/include/verilated_vcd_c.cpp
@@ -66,7 +66,7 @@ constexpr unsigned VL_TRACE_SUFFIX_ENTRY_SIZE = 8;  // Size of a suffix entry
 // Specialization of the generics for this trace format
 
 #define VL_DERIVED_T VerilatedVcd
-#include "verilated_trace_imp.cpp"
+#include "verilated_trace_imp.h"
 #undef VL_DERIVED_T
 
 //=============================================================================
@@ -604,7 +604,7 @@ void VerilatedVcd::finishLine(uint32_t code, char* writep) {
 // emit* trace routines
 
 // Note: emit* are only ever called from one place (full* in
-// verilated_trace_imp.cpp, which is included in this file at the top),
+// verilated_trace_imp.h, which is included in this file at the top),
 // so always inline them.
 
 VL_ATTR_ALWINLINE

From a7cd7a1ed989bf4fc2d220bd3ea09c87b5b08c78 Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Sat, 28 May 2022 12:29:36 +0100
Subject: [PATCH 05/19] Initialize VerilatedTrace members in class

---
 include/verilated_trace.h     | 28 ++++++++++++++--------------
 include/verilated_trace_imp.h | 19 +------------------
 2 files changed, 15 insertions(+), 32 deletions(-)

diff --git a/include/verilated_trace.h b/include/verilated_trace.h
index a88ce6b50..622cb936e 100644
--- a/include/verilated_trace.h
+++ b/include/verilated_trace.h
@@ -146,23 +146,23 @@ private:
             , m_userp{userp} {}
     };
 
-    uint32_t* m_sigs_oldvalp;  // Old value store
-    EData* m_sigs_enabledp;  // Bit vector of enabled codes (nullptr = all on)
-    uint64_t m_timeLastDump;  // Last time we did a dump
+    uint32_t* m_sigs_oldvalp = nullptr;  // Old value store
+    EData* m_sigs_enabledp = nullptr;  // Bit vector of enabled codes (nullptr = all on)
+    uint64_t m_timeLastDump = 0;  // Last time we did a dump
     std::vector<bool> m_sigs_enabledVec;  // Staging for m_sigs_enabledp
     std::vector<CallbackRecord> m_initCbs;  // Routines to initialize traciong
     std::vector<CallbackRecord> m_fullCbs;  // Routines to perform full dump
     std::vector<CallbackRecord> m_chgCbs;  // Routines to perform incremental dump
     std::vector<CallbackRecord> m_cleanupCbs;  // Routines to call at the end of dump
-    bool m_fullDump;  // Whether a full dump is required on the next call to 'dump'
-    uint32_t m_nextCode;  // Next code number to assign
-    uint32_t m_numSignals;  // Number of distinct signals
-    uint32_t m_maxBits;  // Number of bits in the widest signal
+    bool m_fullDump = true;  // Whether a full dump is required on the next call to 'dump'
+    uint32_t m_nextCode = 0;  // Next code number to assign
+    uint32_t m_numSignals = 0;  // Number of distinct signals
+    uint32_t m_maxBits = 0;  // Number of bits in the widest signal
     std::vector<std::string> m_namePrefixStack{""};  // Path prefixes to add to signal names
     std::vector<std::pair<int, std::string>> m_dumpvars;  // dumpvar() entries
-    char m_scopeEscape;
-    double m_timeRes;  // Time resolution (ns/ms etc)
-    double m_timeUnit;  // Time units (ns/ms etc)
+    char m_scopeEscape = '.';
+    double m_timeRes = 1e-9;  // Time resolution (ns/ms etc)
+    double m_timeUnit = 1e-0;  // Time units (ns/ms etc)
 
     void addCallbackRecord(std::vector<CallbackRecord>& cbVec, CallbackRecord& cbRec)
         VL_MT_SAFE_EXCLUDES(m_mutex);
@@ -178,17 +178,17 @@ private:
 
 #ifdef VL_TRACE_OFFLOAD
     // Number of total offload buffers that have been allocated
-    uint32_t m_numOffloadBuffers;
+    uint32_t m_numOffloadBuffers = 0;
     // Size of offload buffers
-    size_t m_offloadBufferSize;
+    size_t m_offloadBufferSize = 0;
     // Buffers handed to worker for processing
     VerilatedThreadQueue<uint32_t*> m_offloadBuffersToWorker;
     // Buffers returned from worker after processing
     VerilatedThreadQueue<uint32_t*> m_offloadBuffersFromWorker;
     // Write pointer into current buffer
-    uint32_t* m_offloadBufferWritep;
+    uint32_t* m_offloadBufferWritep = nullptr;
     // End of offload buffer
-    uint32_t* m_offloadBufferEndp;
+    uint32_t* m_offloadBufferEndp = nullptr;
     // The offload worker thread itself
     std::unique_ptr<std::thread> m_workerThread;
 
diff --git a/include/verilated_trace_imp.h b/include/verilated_trace_imp.h
index ed0503fcb..e62e40cab 100644
--- a/include/verilated_trace_imp.h
+++ b/include/verilated_trace_imp.h
@@ -275,24 +275,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::onExit(void* selfp) {
 //=============================================================================
 // VerilatedTrace
 
-template <>
-VerilatedTrace<VL_DERIVED_T>::VerilatedTrace()
-    : m_sigs_oldvalp{nullptr}
-    , m_sigs_enabledp{nullptr}
-    , m_timeLastDump{0}
-    , m_fullDump{true}
-    , m_nextCode{0}
-    , m_numSignals{0}
-    , m_maxBits{0}
-    , m_scopeEscape{'.'}
-    , m_timeRes{1e-9}
-    , m_timeUnit {
-    1e-9
-}
-#ifdef VL_TRACE_OFFLOAD
-, m_numOffloadBuffers { 0 }
-#endif
-{
+template <> VerilatedTrace<VL_DERIVED_T>::VerilatedTrace() {
     set_time_unit(Verilated::threadContextp()->timeunitString());
     set_time_resolution(Verilated::threadContextp()->timeprecisionString());
 }

From c4b8675d77870489d1acf1e5b5a0fb8eb9b2f7e2 Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Sat, 28 May 2022 12:33:11 +0100
Subject: [PATCH 06/19] Always inline some small, hot trace routines

---
 include/verilated_trace.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/verilated_trace.h b/include/verilated_trace.h
index 622cb936e..3174ff7c1 100644
--- a/include/verilated_trace.h
+++ b/include/verilated_trace.h
@@ -300,7 +300,7 @@ public:
     // duck-typed void emitWData(uint32_t code, const WData* newvalp, int bits) = 0;
     // duck-typed void emitDouble(uint32_t code, double newval) = 0;
 
-    uint32_t* oldp(uint32_t code) { return m_sigs_oldvalp + code; }
+    VL_ATTR_ALWINLINE inline uint32_t* oldp(uint32_t code) { return m_sigs_oldvalp + code; }
 
     // Write to previous value buffer value and emit trace entry.
     void fullBit(uint32_t* oldp, CData newval);
@@ -373,23 +373,23 @@ public:
     // thread and are called chg*Impl
 
     // Check previous dumped value of signal. If changed, then emit trace entry
-    inline void CHG(Bit)(uint32_t* oldp, CData newval) {
+    VL_ATTR_ALWINLINE inline void CHG(Bit)(uint32_t* oldp, CData newval) {
         const uint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullBit(oldp, newval);
     }
-    inline void CHG(CData)(uint32_t* oldp, CData newval, int bits) {
+    VL_ATTR_ALWINLINE inline void CHG(CData)(uint32_t* oldp, CData newval, int bits) {
         const uint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullCData(oldp, newval, bits);
     }
-    inline void CHG(SData)(uint32_t* oldp, SData newval, int bits) {
+    VL_ATTR_ALWINLINE inline void CHG(SData)(uint32_t* oldp, SData newval, int bits) {
         const uint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullSData(oldp, newval, bits);
     }
-    inline void CHG(IData)(uint32_t* oldp, IData newval, int bits) {
+    VL_ATTR_ALWINLINE inline void CHG(IData)(uint32_t* oldp, IData newval, int bits) {
         const uint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullIData(oldp, newval, bits);
     }
-    inline void CHG(QData)(uint32_t* oldp, QData newval, int bits) {
+    VL_ATTR_ALWINLINE inline void CHG(QData)(uint32_t* oldp, QData newval, int bits) {
         const uint64_t diff = *reinterpret_cast<QData*>(oldp) ^ newval;
         if (VL_UNLIKELY(diff)) fullQData(oldp, newval, bits);
     }
@@ -401,7 +401,7 @@ public:
             }
         }
     }
-    inline void CHG(Double)(uint32_t* oldp, double newval) {
+    VL_ATTR_ALWINLINE inline void CHG(Double)(uint32_t* oldp, double newval) {
         // cppcheck-suppress invalidPointerCast
         if (VL_UNLIKELY(*reinterpret_cast<double*>(oldp) != newval)) fullDouble(oldp, newval);
     }

From b51f887567814a3353e95c7fa7d5e14186c1ab39 Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Sun, 29 May 2022 19:08:39 +0100
Subject: [PATCH 07/19] Perform VCD tracing in parallel when using --threads
 (#3449)

VCD tracing is now parallelized using the same thread pool as the model.
We achieve this by breaking the top level trace functions into multiple
top level functions (as many as --threads), and after emitting the time
stamp to the VCD file on the main thread, we execute the tracing
functions in parallel on the same thread pool as the model (which we
pass to the trace file during registration), tracing into a secondary
per thread buffer. The main thread will then stitch (memcpy) the buffers
together into the output file.

This makes the `--trace-threads` option redundant with `--trace`; it now
only affects `--trace-fst`. FST tracing still uses the previous offloading
scheme.

This obviously helps a lot in VCD tracing performance, and I have seen
better than Amdahl speedup, namely I get 3.9x on XiangShan 4T (2.7x on
OpenTitan 4T).
---
 Changes                       |   4 +
 bin/verilator                 |   2 +-
 docs/guide/exe_verilator.rst  |  18 +-
 docs/guide/verilating.rst     |  24 ++-
 include/verilated.h           |   2 +-
 include/verilated_fst_c.cpp   |  52 +++--
 include/verilated_fst_c.h     |  81 +++++---
 include/verilated_trace.h     | 182 +++++++++++++----
 include/verilated_trace_imp.h | 365 +++++++++++++++++++++++-----------
 include/verilated_vcd_c.cpp   | 158 ++++++++++++---
 include/verilated_vcd_c.h     | 156 ++++++++++-----
 include/verilatedos.h         |   4 +
 src/V3EmitCImp.cpp            |  16 +-
 src/V3EmitCMake.cpp           |   5 +-
 src/V3EmitMk.cpp              |   5 +-
 src/V3Options.cpp             |  14 +-
 src/V3Options.h               |   6 +-
 src/V3Trace.cpp               |  31 +--
 test_regress/driver.pl        |   1 -
 19 files changed, 811 insertions(+), 315 deletions(-)

diff --git a/Changes b/Changes
index a0eb2b1d1..8c9418c9f 100644
--- a/Changes
+++ b/Changes
@@ -11,6 +11,10 @@ contributors that suggested a given feature are shown in []. Thanks!
 Verilator 4.223 devel
 ==========================
 
+**Major:**
+
+* VCD tracing is now parallelized with --threads (#3449). [Geza Lore, Shunyao CAD]
+
 **Minor:**
 
 * Support compile time trace signal selection with tracing_on/off (#3323). [Shunyao CAD]
diff --git a/bin/verilator b/bin/verilator
index b1ee97e73..40be6ba0f 100755
--- a/bin/verilator
+++ b/bin/verilator
@@ -405,7 +405,7 @@ detailed descriptions of these arguments.
     --trace-max-width <width>   Maximum array depth for tracing
     --trace-params              Enable tracing of parameters
     --trace-structs             Enable tracing structure names
-    --trace-threads <threads>   Enable waveform creation on separate threads
+    --trace-threads <threads>   Enable FST waveform creation on separate threads
     --trace-underscore          Enable tracing of _signals
      -U<var>                    Undefine preprocessor define
     --unroll-count <loops>      Tune maximum loop iterations
diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst
index 00e101ed0..6100dcd55 100644
--- a/docs/guide/exe_verilator.rst
+++ b/docs/guide/exe_verilator.rst
@@ -1041,7 +1041,8 @@ Summary:
    is not thread safe. With "--threads 1", the generated model is single
    threaded but may run in a multithreaded environment. With "--threads N",
    where N >= 2, the model is generated to run multithreaded on up to N
-   threads. See :ref:`Multithreading`.
+   threads. See :ref:`Multithreading`. This option also applies to
+   :vlopt:`--trace` (but not :vlopt:`--trace-fst`).
 
 .. option:: --threads-dpi all
 
@@ -1119,7 +1120,8 @@ Summary:
    Having tracing compiled in may result in some small performance losses,
    even when tracing is not turned on during model execution.
 
-   See also :vlopt:`--trace-threads` option.
+   When using :vlopt:`--threads`, VCD tracing is parallelized, using the
+   same number of threads as passed to :vlopt:`--threads`.
 
 .. option:: --trace-coverage
 
@@ -1173,12 +1175,12 @@ Summary:
 .. option:: --trace-threads *threads*
 
    Enable waveform tracing using separate threads. This is typically faster
-   in simulation runtime but uses more total compute. This option is
-   independent of, and works with, both :vlopt:`--trace` and
-   :vlopt:`--trace-fst`.  Different trace formats can take advantage of
-   more trace threads to varying degrees. Currently VCD tracing can utilize
-   at most "--trace-threads 1", and FST tracing can utilize at most
-   "--trace-threads 2". This overrides :vlopt:`--no-threads` .
+   in simulation runtime but uses more total compute. This option only
+   applies to :vlopt:`--trace-fst`. FST tracing can utilize at most
+   "--trace-threads 2". This overrides :vlopt:`--no-threads`.
+
+   This option is accepted, but has absolutely no effect with
+   :vlopt:`--trace`, which respects :vlopt:`--threads` instead.
 
 .. option:: --trace-underscore
 
diff --git a/docs/guide/verilating.rst b/docs/guide/verilating.rst
index f443ca298..2af18c1f0 100644
--- a/docs/guide/verilating.rst
+++ b/docs/guide/verilating.rst
@@ -221,9 +221,13 @@ model, it may be beneficial to performance to adjust the
 influences the partitioning of the model by adjusting the assumed execution
 time of DPI imports.
 
-The :vlopt:`--trace-threads` options can be used to produce trace dumps
-using multiple threads. If :vlopt:`--trace-threads` is set without
-:vlopt:`--threads`, then :vlopt:`--trace-threads` will imply
+When using :vlopt:`--trace` to perform VCD tracing, the VCD trace
+construction is parallelized using the same number of threads as specified
+with :vlopt:`--threads`, and is executed on the same thread pool as the model.
+
+The :vlopt:`--trace-threads` option can be used with :vlopt:`--trace-fst`
+to offload FST tracing using multiple threads. If :vlopt:`--trace-threads` is
+given without :vlopt:`--threads`, then :vlopt:`--trace-threads` will imply
 :vlopt:`--threads 1 <--threads>`, i.e.: the support libraries will be
 thread safe.
 
@@ -231,12 +235,12 @@ With :vlopt:`--trace-threads 0 <--trace-threads>`, trace dumps are produced
 on the main thread. This again gives the highest single thread performance.
 
 With :vlopt:`--trace-threads {N} <--trace-threads>`, where N is at least 1,
-N additional threads will be created and managed by the trace files (e.g.:
-VerilatedVcdC or VerilatedFstC), to generate the trace dump. The main
-thread will be released to proceed with execution as soon as possible,
-though some blocking of the main thread is still necessary while capturing
-the trace. Different trace formats can utilize a various number of
-threads. See the :vlopt:`--trace-threads` option.
+up to N additional threads will be created and managed by the trace files
+(e.g.: VerilatedFstC), to offload construction of the trace dump. The main
+thread will be released to proceed with execution as soon as possible, though
+some blocking of the main thread is still necessary while capturing the
+trace. FST tracing can utilize up to 2 offload threads, so there is no use
+in setting :vlopt:`--trace-threads` higher than 2 at the moment.
 
 When running a multithreaded model, the default Linux task scheduler often
 works against the model, by assuming threads are short lived, and thus
@@ -441,7 +445,7 @@ SystemC include directories and link to the SystemC libraries.
 
 .. describe:: TRACE_THREADS
 
-   Optional. Generated multi-threaded trace dumping, same as
+   Optional. Generated multi-threaded FST trace dumping, same as
    "--trace-threads".
 
 .. describe:: TOP_MODULE
diff --git a/include/verilated.h b/include/verilated.h
index 804d7363a..f9cf79601 100644
--- a/include/verilated.h
+++ b/include/verilated.h
@@ -147,7 +147,7 @@ extern uint32_t VL_THREAD_ID() VL_MT_SAFE;
 
 #if VL_THREADED
 
-#define VL_LOCK_SPINS 50000  /// Number of times to spin for a mutex before relaxing
+#define VL_LOCK_SPINS 50000  /// Number of times to spin for a mutex before yielding
 
 /// Mutex, wrapped to allow -fthread_safety checks
 class VL_CAPABILITY("mutex") VerilatedMutex final {
diff --git a/include/verilated_fst_c.cpp b/include/verilated_fst_c.cpp
index 3e1b27744..0bc1048cf 100644
--- a/include/verilated_fst_c.cpp
+++ b/include/verilated_fst_c.cpp
@@ -83,9 +83,11 @@ static_assert(static_cast<int>(FST_ST_VCD_PROGRAM) == static_cast<int>(VLT_TRACE
 //=============================================================================
 // Specialization of the generics for this trace format
 
-#define VL_DERIVED_T VerilatedFst
+#define VL_SUB_T VerilatedFst
+#define VL_BUF_T VerilatedFstBuffer
 #include "verilated_trace_imp.h"
-#undef VL_DERIVED_T
+#undef VL_SUB_T
+#undef VL_BUF_T
 
 //=============================================================================
 // VerilatedFst
@@ -111,7 +113,7 @@ void VerilatedFst::open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex) {
 
     m_curScope.clear();
 
-    VerilatedTrace<VerilatedFst>::traceInit();
+    Super::traceInit();
 
     // Clear the scope stack
     auto it = m_curScope.begin();
@@ -133,14 +135,14 @@ void VerilatedFst::open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex) {
 
 void VerilatedFst::close() VL_MT_SAFE_EXCLUDES(m_mutex) {
     const VerilatedLockGuard lock{m_mutex};
-    VerilatedTrace<VerilatedFst>::closeBase();
+    Super::closeBase();
     fstWriterClose(m_fst);
     m_fst = nullptr;
 }
 
 void VerilatedFst::flush() VL_MT_SAFE_EXCLUDES(m_mutex) {
     const VerilatedLockGuard lock{m_mutex};
-    VerilatedTrace<VerilatedFst>::flushBase();
+    Super::flushBase();
     fstWriterFlushContext(m_fst);
 }
 
@@ -162,7 +164,7 @@ void VerilatedFst::declare(uint32_t code, const char* name, int dtypenum, fstVar
                            int lsb) {
     const int bits = ((msb > lsb) ? (msb - lsb) : (lsb - msb)) + 1;
 
-    const bool enabled = VerilatedTrace<VerilatedFst>::declCode(code, name, bits, false);
+    const bool enabled = Super::declCode(code, name, bits, false);
     if (!enabled) return;
 
     std::string nameasstr = namePrefix() + name;
@@ -245,18 +247,42 @@ void VerilatedFst::declDouble(uint32_t code, const char* name, int dtypenum, fst
     declare(code, name, dtypenum, vardir, vartype, array, arraynum, false, 63, 0);
 }
 
+//=============================================================================
+// Get/commit trace buffer
+
+VerilatedFstBuffer* VerilatedFst::getTraceBuffer() { return new VerilatedFstBuffer{*this}; }
+
+void VerilatedFst::commitTraceBuffer(VerilatedFstBuffer* bufp) {
+#ifdef VL_TRACE_OFFLOAD
+    if (bufp->m_offloadBufferWritep) {
+        m_offloadBufferWritep = bufp->m_offloadBufferWritep;
+        return;  // Buffer will be deleted by the offload thread
+    }
+#endif
+    delete bufp;
+}
+
+//=============================================================================
+// VerilatedFstBuffer implementation
+
+VerilatedFstBuffer::VerilatedFstBuffer(VerilatedFst& owner)
+    : VerilatedTraceBuffer<VerilatedFst, VerilatedFstBuffer>{owner} {}
+
+//=============================================================================
+// Trace rendering primitives
+
 // Note: emit* are only ever called from one place (full* in
 // verilated_trace_imp.h, which is included in this file at the top),
 // so always inline them.
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitBit(uint32_t code, CData newval) {
+void VerilatedFstBuffer::emitBit(uint32_t code, CData newval) {
     VL_DEBUG_IFDEF(assert(m_symbolp[code]););
     fstWriterEmitValueChange(m_fst, m_symbolp[code], newval ? "1" : "0");
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitCData(uint32_t code, CData newval, int bits) {
+void VerilatedFstBuffer::emitCData(uint32_t code, CData newval, int bits) {
     char buf[VL_BYTESIZE];
     VL_DEBUG_IFDEF(assert(m_symbolp[code]););
     cvtCDataToStr(buf, newval << (VL_BYTESIZE - bits));
@@ -264,7 +290,7 @@ void VerilatedFst::emitCData(uint32_t code, CData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitSData(uint32_t code, SData newval, int bits) {
+void VerilatedFstBuffer::emitSData(uint32_t code, SData newval, int bits) {
     char buf[VL_SHORTSIZE];
     VL_DEBUG_IFDEF(assert(m_symbolp[code]););
     cvtSDataToStr(buf, newval << (VL_SHORTSIZE - bits));
@@ -272,7 +298,7 @@ void VerilatedFst::emitSData(uint32_t code, SData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitIData(uint32_t code, IData newval, int bits) {
+void VerilatedFstBuffer::emitIData(uint32_t code, IData newval, int bits) {
     char buf[VL_IDATASIZE];
     VL_DEBUG_IFDEF(assert(m_symbolp[code]););
     cvtIDataToStr(buf, newval << (VL_IDATASIZE - bits));
@@ -280,7 +306,7 @@ void VerilatedFst::emitIData(uint32_t code, IData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitQData(uint32_t code, QData newval, int bits) {
+void VerilatedFstBuffer::emitQData(uint32_t code, QData newval, int bits) {
     char buf[VL_QUADSIZE];
     VL_DEBUG_IFDEF(assert(m_symbolp[code]););
     cvtQDataToStr(buf, newval << (VL_QUADSIZE - bits));
@@ -288,7 +314,7 @@ void VerilatedFst::emitQData(uint32_t code, QData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitWData(uint32_t code, const WData* newvalp, int bits) {
+void VerilatedFstBuffer::emitWData(uint32_t code, const WData* newvalp, int bits) {
     int words = VL_WORDS_I(bits);
     char* wp = m_strbuf;
     // Convert the most significant word
@@ -304,6 +330,6 @@ void VerilatedFst::emitWData(uint32_t code, const WData* newvalp, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitDouble(uint32_t code, double newval) {
+void VerilatedFstBuffer::emitDouble(uint32_t code, double newval) {
     fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval);
 }
diff --git a/include/verilated_fst_c.h b/include/verilated_fst_c.h
index b622a1894..5131cc8cc 100644
--- a/include/verilated_fst_c.h
+++ b/include/verilated_fst_c.h
@@ -31,15 +31,19 @@
 #include <string>
 #include <vector>
 
+class VerilatedFstBuffer;
+
 //=============================================================================
 // VerilatedFst
 // Base class to create a Verilator FST dump
 // This is an internally used class - see VerilatedFstC for what to call from applications
 
-class VerilatedFst final : public VerilatedTrace<VerilatedFst> {
+class VerilatedFst final : public VerilatedTrace<VerilatedFst, VerilatedFstBuffer> {
+public:
+    using Super = VerilatedTrace<VerilatedFst, VerilatedFstBuffer>;
+
 private:
-    // Give the superclass access to private bits (to avoid virtual functions)
-    friend class VerilatedTrace<VerilatedFst>;
+    friend Buffer;  // Give the buffer access to the private bits
 
     //=========================================================================
     // FST specific internals
@@ -60,31 +64,26 @@ protected:
     //=========================================================================
     // Implementation of VerilatedTrace interface
 
-    // Implementations of protected virtual methods for VerilatedTrace
+    // Called when the trace moves forward to a new time point
     virtual void emitTimeChange(uint64_t timeui) override;
 
     // Hooks called from VerilatedTrace
     virtual bool preFullDump() override { return isOpen(); }
     virtual bool preChangeDump() override { return isOpen(); }
 
-    // Implementations of duck-typed methods for VerilatedTrace. These are
-    // called from only one place (namely full*) so always inline them.
-    inline void emitBit(uint32_t code, CData newval);
-    inline void emitCData(uint32_t code, CData newval, int bits);
-    inline void emitSData(uint32_t code, SData newval, int bits);
-    inline void emitIData(uint32_t code, IData newval, int bits);
-    inline void emitQData(uint32_t code, QData newval, int bits);
-    inline void emitWData(uint32_t code, const WData* newvalp, int bits);
-    inline void emitDouble(uint32_t code, double newval);
+    // Trace buffer management
+    virtual VerilatedFstBuffer* getTraceBuffer() override;
+    virtual void commitTraceBuffer(VerilatedFstBuffer*) override;
 
 public:
     //=========================================================================
     // External interface to client code
-    // (All must be threadsafe)
 
+    // CONSTRUCTOR
     explicit VerilatedFst(void* fst = nullptr);
     ~VerilatedFst();
 
+    // METHODS - All must be thread safe
     // Open the file; call isOpen() to see if errors
     void open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex);
     // Close the file
@@ -97,11 +96,6 @@ public:
     //=========================================================================
     // Internal interface to Verilator generated code
 
-    // Inside dumping routines, declare a data type
-    void declDTypeEnum(int dtypenum, const char* name, uint32_t elements, unsigned int minValbits,
-                       const char** itemNamesp, const char** itemValuesp);
-
-    // Inside dumping routines, declare a signal
     void declBit(uint32_t code, const char* name, int dtypenum, fstVarDir vardir,
                  fstVarType vartype, bool array, int arraynum);
     void declBus(uint32_t code, const char* name, int dtypenum, fstVarDir vardir,
@@ -112,18 +106,55 @@ public:
                    fstVarType vartype, bool array, int arraynum, int msb, int lsb);
     void declDouble(uint32_t code, const char* name, int dtypenum, fstVarDir vardir,
                     fstVarType vartype, bool array, int arraynum);
+
+    void declDTypeEnum(int dtypenum, const char* name, uint32_t elements, unsigned int minValbits,
+                       const char** itemNamesp, const char** itemValuesp);
 };
 
 #ifndef DOXYGEN
 // Declare specialization here as it's used in VerilatedFstC just below
-template <> void VerilatedTrace<VerilatedFst>::dump(uint64_t timeui);
-template <> void VerilatedTrace<VerilatedFst>::set_time_unit(const char* unitp);
-template <> void VerilatedTrace<VerilatedFst>::set_time_unit(const std::string& unit);
-template <> void VerilatedTrace<VerilatedFst>::set_time_resolution(const char* unitp);
-template <> void VerilatedTrace<VerilatedFst>::set_time_resolution(const std::string& unit);
-template <> void VerilatedTrace<VerilatedFst>::dumpvars(int level, const std::string& hier);
+template <> void VerilatedFst::Super::dump(uint64_t time);
+template <> void VerilatedFst::Super::set_time_unit(const char* unitp);
+template <> void VerilatedFst::Super::set_time_unit(const std::string& unit);
+template <> void VerilatedFst::Super::set_time_resolution(const char* unitp);
+template <> void VerilatedFst::Super::set_time_resolution(const std::string& unit);
+template <> void VerilatedFst::Super::dumpvars(int level, const std::string& hier);
 #endif
 
+//=============================================================================
+// VerilatedFstBuffer
+
+class VerilatedFstBuffer final : public VerilatedTraceBuffer<VerilatedFst, VerilatedFstBuffer> {
+    // Give the trace file access to the private bits
+    friend VerilatedFst;
+    friend VerilatedFst::Super;
+
+    // The FST file handle
+    void* const m_fst = m_owner.m_fst;
+    // code to fstHandle map, as an array
+    const fstHandle* const m_symbolp = m_owner.m_symbolp;
+    // String buffer long enough to hold maxBits() chars
+    char* const m_strbuf = m_owner.m_strbuf;
+
+public:
+    // CONSTRUCTOR
+    explicit VerilatedFstBuffer(VerilatedFst& owner);
+    ~VerilatedFstBuffer() = default;
+
+    //=========================================================================
+    // Implementation of VerilatedTraceBuffer interface
+
+    // Implementations of duck-typed methods for VerilatedTraceBuffer. These are
+    // called from only one place (the full* methods), so always inline them.
+    VL_ATTR_ALWINLINE inline void emitBit(uint32_t code, CData newval);
+    VL_ATTR_ALWINLINE inline void emitCData(uint32_t code, CData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitSData(uint32_t code, SData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitIData(uint32_t code, IData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitQData(uint32_t code, QData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitWData(uint32_t code, const WData* newvalp, int bits);
+    VL_ATTR_ALWINLINE inline void emitDouble(uint32_t code, double newval);
+};
+
 //=============================================================================
 // VerilatedFstC
 /// Create a FST dump file in C standalone (no SystemC) simulations.
diff --git a/include/verilated_trace.h b/include/verilated_trace.h
index 3174ff7c1..7915c3645 100644
--- a/include/verilated_trace.h
+++ b/include/verilated_trace.h
@@ -22,28 +22,43 @@
 #ifndef VERILATOR_VERILATED_TRACE_H_
 #define VERILATOR_VERILATED_TRACE_H_
 
-#ifdef VL_TRACE_THREADED
-#define VL_TRACE_OFFLOAD
+// clang-format off
+
+// In FST mode, VL_TRACE_THREADED enables offloading, but only if we also have
+// the FST writer thread. This means with --trace-threads 1, we get the FST
+// writer thread only, and with --trace-threads 2 we get offloading as well
+#if defined(VL_TRACE_FST_WRITER_THREAD) && defined(VL_TRACE_THREADED)
+# define VL_TRACE_OFFLOAD
+#endif
+// VCD tracing can happen fully in parallel
+#if defined(VM_TRACE_VCD) && VM_TRACE_VCD && defined(VL_TRACE_THREADED)
+# define VL_TRACE_PARALLEL
 #endif
 
-// clang-format off
+#if defined(VL_TRACE_PARALLEL) && defined(VL_TRACE_OFFLOAD)
+# error "Cannot have VL_TRACE_PARALLEL and VL_TRACE_OFFLOAD together"
+#endif
 
 #include "verilated.h"
 #include "verilated_trace_defs.h"
 
 #include <bitset>
+#include <condition_variable>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #ifdef VL_TRACE_OFFLOAD
-# include <condition_variable>
 # include <deque>
 # include <thread>
 #endif
 
 // clang-format on
 
+class VlThreadPool;
+template <class T_Trace, class T_Buffer> class VerilatedTraceBuffer;
+
 #ifdef VL_TRACE_OFFLOAD
 //=============================================================================
 // Offloaded tracing
@@ -106,7 +121,8 @@ public:
         CHG_WDATA = 0x6,
         CHG_DOUBLE = 0x8,
         // TODO: full..
-        TIME_CHANGE = 0xd,
+        TIME_CHANGE = 0xc,
+        TRACE_BUFFER = 0xd,
         END = 0xe,  // End of buffer
         SHUTDOWN = 0xf  // Shutdown worker thread, also marks end of buffer
     };
@@ -116,16 +132,22 @@ public:
 //=============================================================================
 // VerilatedTrace
 
-// VerilatedTrace uses F-bounded polymorphism to access duck-typed
-// implementations in the format specific derived class, which must be passed
-// as the type parameter T_Derived
-template <class T_Derived> class VerilatedTrace VL_NOT_FINAL {
+// T_Trace is the format specific subclass of VerilatedTrace.
+// T_Buffer is the format specific subclass of VerilatedTraceBuffer.
+template <class T_Trace, class T_Buffer> class VerilatedTrace VL_NOT_FINAL {
+    // Give the buffer (both base and derived) access to the private bits
+    friend VerilatedTraceBuffer<T_Trace, T_Buffer>;
+    friend T_Buffer;
+
 public:
+    using Buffer = T_Buffer;
+
     //=========================================================================
     // Generic tracing internals
 
-    using initCb_t = void (*)(void*, T_Derived*, uint32_t);  // Type of init callbacks
-    using dumpCb_t = void (*)(void*, T_Derived*);  // Type of all but init callbacks
+    using initCb_t = void (*)(void*, T_Trace*, uint32_t);  // Type of init callbacks
+    using dumpCb_t = void (*)(void*, Buffer*);  // Type of dump callbacks
+    using cleanupCb_t = void (*)(void*, T_Trace*);  // Type of cleanup callbacks
 
 private:
     struct CallbackRecord {
@@ -133,9 +155,10 @@ private:
         // (the one in Ubuntu 14.04 with GCC 4.8.4 in particular) use the
         // assignment operator on inserting into collections, so they don't work
         // with const fields...
-        union {
-            initCb_t m_initCb;  // The callback function
-            dumpCb_t m_dumpCb;  // The callback function
+        union {  // The callback
+            initCb_t m_initCb;
+            dumpCb_t m_dumpCb;
+            cleanupCb_t m_cleanupCb;
         };
         void* m_userp;  // The user pointer to pass to the callback (the symbol table)
         CallbackRecord(initCb_t cb, void* userp)
@@ -144,16 +167,46 @@ private:
         CallbackRecord(dumpCb_t cb, void* userp)
             : m_dumpCb{cb}
             , m_userp{userp} {}
+        CallbackRecord(cleanupCb_t cb, void* userp)
+            : m_cleanupCb{cb}
+            , m_userp{userp} {}
     };
 
-    uint32_t* m_sigs_oldvalp = nullptr;  // Old value store
+#ifdef VL_TRACE_PARALLEL
+    struct ParallelWorkerData {
+        const dumpCb_t m_cb;  // The callback
+        void* const m_userp;  // The user pointer to pass to the callback
+        Buffer* const m_bufp;  // The buffer pointer to pass to the callback
+        std::atomic<bool> m_ready{false};  // The ready flag
+        mutable VerilatedMutex m_mutex;  // Mutex for suspension until ready
+        std::condition_variable_any m_cv;  // Condition variable for suspension
+        bool m_waiting VL_GUARDED_BY(m_mutex) = false;  // Whether a thread is suspended in wait()
+
+        void wait();
+
+        ParallelWorkerData(dumpCb_t cb, void* userp, Buffer* bufp)
+            : m_cb{cb}
+            , m_userp{userp}
+            , m_bufp{bufp} {}
+    };
+
+    // Passed a ParallelWorkerData*, second argument is ignored
+    static void parallelWorkerTask(void*, bool);
+#endif
+
+    using ParallelCallbackMap = std::unordered_map<VlThreadPool*, std::vector<CallbackRecord>>;
+
+protected:
+    uint32_t* m_sigs_oldvalp = nullptr;  // Previous value store
     EData* m_sigs_enabledp = nullptr;  // Bit vector of enabled codes (nullptr = all on)
+private:
     uint64_t m_timeLastDump = 0;  // Last time we did a dump
     std::vector<bool> m_sigs_enabledVec;  // Staging for m_sigs_enabledp
-    std::vector<CallbackRecord> m_initCbs;  // Routines to initialize traciong
-    std::vector<CallbackRecord> m_fullCbs;  // Routines to perform full dump
-    std::vector<CallbackRecord> m_chgCbs;  // Routines to perform incremental dump
+    std::vector<CallbackRecord> m_initCbs;  // Routines to initialize tracing
+    ParallelCallbackMap m_fullCbs;  // Routines to perform full dump
+    ParallelCallbackMap m_chgCbs;  // Routines to perform incremental dump
     std::vector<CallbackRecord> m_cleanupCbs;  // Routines to call at the end of dump
+    std::vector<VlThreadPool*> m_threadPoolps;  // All thread pools, in insertion order
     bool m_fullDump = true;  // Whether a full dump is required on the next call to 'dump'
     uint32_t m_nextCode = 0;  // Next code number to assign
     uint32_t m_numSignals = 0;  // Number of distinct signals
@@ -164,12 +217,16 @@ private:
     double m_timeRes = 1e-9;  // Time resolution (ns/ms etc)
     double m_timeUnit = 1e-0;  // Time units (ns/ms etc)
 
+    void addThreadPool(VlThreadPool* threadPoolp) VL_MT_SAFE_EXCLUDES(m_mutex);
+
     void addCallbackRecord(std::vector<CallbackRecord>& cbVec, CallbackRecord& cbRec)
         VL_MT_SAFE_EXCLUDES(m_mutex);
 
-    // Equivalent to 'this' but is of the sub-type 'T_Derived*'. Use 'self()->'
+    // Equivalent to 'this' but is of the sub-type 'T_Trace*'. Use 'self()->'
     // to access duck-typed functions to avoid a virtual function call.
-    T_Derived* self() { return static_cast<T_Derived*>(this); }
+    T_Trace* self() { return static_cast<T_Trace*>(this); }
+
+    void runParallelCallbacks(const ParallelCallbackMap& cbMap);
 
     // Flush any remaining data for this file
     static void onFlush(void* selfp) VL_MT_UNSAFE_ONE;
@@ -185,10 +242,14 @@ private:
     VerilatedThreadQueue<uint32_t*> m_offloadBuffersToWorker;
     // Buffers returned from worker after processing
     VerilatedThreadQueue<uint32_t*> m_offloadBuffersFromWorker;
+
+protected:
     // Write pointer into current buffer
     uint32_t* m_offloadBufferWritep = nullptr;
     // End of offload buffer
     uint32_t* m_offloadBufferEndp = nullptr;
+
+private:
     // The offload worker thread itself
     std::unique_ptr<std::thread> m_workerThread;
 
@@ -250,6 +311,10 @@ protected:
     virtual bool preFullDump() = 0;
     virtual bool preChangeDump() = 0;
 
+    // Trace buffer management
+    virtual Buffer* getTraceBuffer() = 0;
+    virtual void commitTraceBuffer(Buffer*) = 0;
+
 public:
     //=========================================================================
     // External interface to client code
@@ -270,19 +335,55 @@ public:
     // Call
     void dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex);
 
+    //=========================================================================
+    // Internal interface to Verilator generated code
+
     //=========================================================================
     // Non-hot path internal interface to Verilator generated code
 
     void addInitCb(initCb_t cb, void* userp) VL_MT_SAFE;
-    void addFullCb(dumpCb_t cb, void* userp) VL_MT_SAFE;
-    void addChgCb(dumpCb_t cb, void* userp) VL_MT_SAFE;
-    void addCleanupCb(dumpCb_t cb, void* userp) VL_MT_SAFE;
+    void addFullCb(dumpCb_t cb, void* userp, VlThreadPool* = nullptr) VL_MT_SAFE;
+    void addChgCb(dumpCb_t cb, void* userp, VlThreadPool* = nullptr) VL_MT_SAFE;
+    void addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE;
 
     void scopeEscape(char flag) { m_scopeEscape = flag; }
 
     void pushNamePrefix(const std::string&);
     void popNamePrefix(unsigned count = 1);
+};
 
+//=============================================================================
+// VerilatedTraceBuffer
+
+// T_Trace is the format specific subclass of VerilatedTrace.
+// T_Buffer is the format specific subclass of VerilatedTraceBuffer.
+// The format-specific hot-path methods use duck-typing via T_Buffer for performance.
+template <class T_Trace, class T_Buffer> class VerilatedTraceBuffer VL_NOT_FINAL {
+    friend T_Trace;  // Give the trace file access to the private bits
+
+protected:
+    T_Trace& m_owner;  // The VerilatedTrace subclass that owns this buffer
+
+    // Previous value store
+    uint32_t* const m_sigs_oldvalp = m_owner.m_sigs_oldvalp;
+    // Bit vector of enabled codes (nullptr = all on)
+    EData* const m_sigs_enabledp = m_owner.m_sigs_enabledp;
+
+#ifdef VL_TRACE_OFFLOAD
+    // Write pointer into current buffer
+    uint32_t* m_offloadBufferWritep = m_owner.m_offloadBufferWritep;
+    // End of offload buffer
+    uint32_t* const m_offloadBufferEndp = m_owner.m_offloadBufferEndp;
+#endif
+
+    // Equivalent to 'this' but is of the sub-type 'T_Buffer*'. Use 'self()->'
+    // to access duck-typed functions to avoid a virtual function call.
+    inline T_Buffer* self() { return static_cast<T_Buffer*>(this); }
+
+    explicit VerilatedTraceBuffer(T_Trace& owner);
+    virtual ~VerilatedTraceBuffer() = default;
+
+public:
     //=========================================================================
     // Hot path internal interface to Verilator generated code
 
@@ -363,9 +464,13 @@ public:
         VL_DEBUG_IF(assert(m_offloadBufferWritep <= m_offloadBufferEndp););
     }
 
-#define CHG(name) chg##name##Impl
-#else
-#define CHG(name) chg##name
+#define chgBit chgBitImpl
+#define chgCData chgCDataImpl
+#define chgSData chgSDataImpl
+#define chgIData chgIDataImpl
+#define chgQData chgQDataImpl
+#define chgWData chgWDataImpl
+#define chgDouble chgDoubleImpl
 #endif
 
     // In non-offload mode, these are called directly by the trace callbacks,
@@ -373,27 +478,27 @@ public:
     // thread and are called chg*Impl
 
     // Check previous dumped value of signal. If changed, then emit trace entry
-    VL_ATTR_ALWINLINE inline void CHG(Bit)(uint32_t* oldp, CData newval) {
+    VL_ATTR_ALWINLINE inline void chgBit(uint32_t* oldp, CData newval) {
         const uint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullBit(oldp, newval);
     }
-    VL_ATTR_ALWINLINE inline void CHG(CData)(uint32_t* oldp, CData newval, int bits) {
+    VL_ATTR_ALWINLINE inline void chgCData(uint32_t* oldp, CData newval, int bits) {
         const uint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullCData(oldp, newval, bits);
     }
-    VL_ATTR_ALWINLINE inline void CHG(SData)(uint32_t* oldp, SData newval, int bits) {
+    VL_ATTR_ALWINLINE inline void chgSData(uint32_t* oldp, SData newval, int bits) {
         const uint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullSData(oldp, newval, bits);
     }
-    VL_ATTR_ALWINLINE inline void CHG(IData)(uint32_t* oldp, IData newval, int bits) {
+    VL_ATTR_ALWINLINE inline void chgIData(uint32_t* oldp, IData newval, int bits) {
         const uint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullIData(oldp, newval, bits);
     }
-    VL_ATTR_ALWINLINE inline void CHG(QData)(uint32_t* oldp, QData newval, int bits) {
+    VL_ATTR_ALWINLINE inline void chgQData(uint32_t* oldp, QData newval, int bits) {
         const uint64_t diff = *reinterpret_cast<QData*>(oldp) ^ newval;
         if (VL_UNLIKELY(diff)) fullQData(oldp, newval, bits);
     }
-    inline void CHG(WData)(uint32_t* oldp, const WData* newvalp, int bits) {
+    VL_ATTR_ALWINLINE inline void chgWData(uint32_t* oldp, const WData* newvalp, int bits) {
         for (int i = 0; i < (bits + 31) / 32; ++i) {
             if (VL_UNLIKELY(oldp[i] ^ newvalp[i])) {
                 fullWData(oldp, newvalp, bits);
@@ -401,11 +506,20 @@ public:
             }
         }
     }
-    VL_ATTR_ALWINLINE inline void CHG(Double)(uint32_t* oldp, double newval) {
+    VL_ATTR_ALWINLINE inline void chgDouble(uint32_t* oldp, double newval) {
         // cppcheck-suppress invalidPointerCast
         if (VL_UNLIKELY(*reinterpret_cast<double*>(oldp) != newval)) fullDouble(oldp, newval);
     }
 
-#undef CHG
+#ifdef VL_TRACE_OFFLOAD
+#undef chgBit
+#undef chgCData
+#undef chgSData
+#undef chgIData
+#undef chgQData
+#undef chgWData
+#undef chgDouble
+#endif
 };
+
 #endif  // guard
diff --git a/include/verilated_trace_imp.h b/include/verilated_trace_imp.h
index e62e40cab..d2ffa965c 100644
--- a/include/verilated_trace_imp.h
+++ b/include/verilated_trace_imp.h
@@ -20,12 +20,16 @@
 // clang-format off
 
 #ifndef VL_CPPCHECK
-#ifndef VL_DERIVED_T
+#if !defined(VL_SUB_T) || !defined(VL_BUF_T)
 # error "This file should be included in trace format implementations"
 #endif
 
 #include "verilated_intrinsics.h"
 #include "verilated_trace.h"
+#ifdef VL_TRACE_PARALLEL
+# include "verilated_threads.h"
+# include <list>
+#endif
 
 #if 0
 # include <iostream>
@@ -78,7 +82,7 @@ static std::string doubleToTimescale(double value) {
 //=========================================================================
 // Buffer management
 
-template <> uint32_t* VerilatedTrace<VL_DERIVED_T>::getOffloadBuffer() {
+template <> uint32_t* VerilatedTrace<VL_SUB_T, VL_BUF_T>::getOffloadBuffer() {
     uint32_t* bufferp;
     // Some jitter is expected, so some number of alternative offlaod buffers are
     // required, but don't allocate more than 8 buffers.
@@ -97,7 +101,7 @@ template <> uint32_t* VerilatedTrace<VL_DERIVED_T>::getOffloadBuffer() {
     return bufferp;
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::waitForOffloadBuffer(const uint32_t* buffp) {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::waitForOffloadBuffer(const uint32_t* buffp) {
     // Slow path code only called on flush/shutdown, so use a simple algorithm.
     // Collect buffers from worker and stash them until we get the one we want.
     std::deque<uint32_t*> stash;
@@ -112,7 +116,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::waitForOffloadBuffer(const uint32
 //=========================================================================
 // Worker thread
 
-template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::offloadWorkerThreadMain() {
     bool shutdown = false;
 
     do {
@@ -123,6 +127,8 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
 
         const uint32_t* readp = bufferp;
 
+        std::unique_ptr<VL_BUF_T> traceBufp;  // We own the passed tracebuffer
+
         while (true) {
             const uint32_t cmd = readp[0];
             const uint32_t top = cmd >> 4;
@@ -137,44 +143,44 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
                 // CHG_* commands
             case VerilatedTraceOffloadCommand::CHG_BIT_0:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_BIT_0 " << top);
-                chgBitImpl(oldp, 0);
+                traceBufp->chgBitImpl(oldp, 0);
                 continue;
             case VerilatedTraceOffloadCommand::CHG_BIT_1:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_BIT_1 " << top);
-                chgBitImpl(oldp, 1);
+                traceBufp->chgBitImpl(oldp, 1);
                 continue;
             case VerilatedTraceOffloadCommand::CHG_CDATA:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_CDATA " << top);
                 // Bits stored in bottom byte of command
-                chgCDataImpl(oldp, *readp, top);
+                traceBufp->chgCDataImpl(oldp, *readp, top);
                 readp += 1;
                 continue;
             case VerilatedTraceOffloadCommand::CHG_SDATA:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_SDATA " << top);
                 // Bits stored in bottom byte of command
-                chgSDataImpl(oldp, *readp, top);
+                traceBufp->chgSDataImpl(oldp, *readp, top);
                 readp += 1;
                 continue;
             case VerilatedTraceOffloadCommand::CHG_IDATA:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_IDATA " << top);
                 // Bits stored in bottom byte of command
-                chgIDataImpl(oldp, *readp, top);
+                traceBufp->chgIDataImpl(oldp, *readp, top);
                 readp += 1;
                 continue;
             case VerilatedTraceOffloadCommand::CHG_QDATA:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_QDATA " << top);
                 // Bits stored in bottom byte of command
-                chgQDataImpl(oldp, *reinterpret_cast<const QData*>(readp), top);
+                traceBufp->chgQDataImpl(oldp, *reinterpret_cast<const QData*>(readp), top);
                 readp += 2;
                 continue;
             case VerilatedTraceOffloadCommand::CHG_WDATA:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_WDATA " << top);
-                chgWDataImpl(oldp, readp, top);
+                traceBufp->chgWDataImpl(oldp, readp, top);
                 readp += VL_WORDS_I(top);
                 continue;
             case VerilatedTraceOffloadCommand::CHG_DOUBLE:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_DOUBLE " << top);
-                chgDoubleImpl(oldp, *reinterpret_cast<const double*>(readp));
+                traceBufp->chgDoubleImpl(oldp, *reinterpret_cast<const double*>(readp));
                 readp += 2;
                 continue;
 
@@ -187,9 +193,18 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
                 readp += 2;
                 continue;
 
+            case VerilatedTraceOffloadCommand::TRACE_BUFFER:
+                VL_TRACE_OFFLOAD_DEBUG("Command TRACE_BUFFER " << top);
+                readp -= 1;  // No code in this command, undo increment
+                traceBufp.reset(*reinterpret_cast<VL_BUF_T* const*>(readp));
+                readp += 2;
+                continue;
+
                 //===
                 // Commands ending this buffer
-            case VerilatedTraceOffloadCommand::END: VL_TRACE_OFFLOAD_DEBUG("Command END"); break;
+            case VerilatedTraceOffloadCommand::END:  //
+                VL_TRACE_OFFLOAD_DEBUG("Command END");
+                break;
             case VerilatedTraceOffloadCommand::SHUTDOWN:
                 VL_TRACE_OFFLOAD_DEBUG("Command SHUTDOWN");
                 shutdown = true;
@@ -198,8 +213,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
             //===
             // Unknown command
             default: {  // LCOV_EXCL_START
-                VL_TRACE_OFFLOAD_DEBUG("Command UNKNOWN");
-                VL_PRINTF_MT("Trace command: 0x%08x\n", cmd);
+                VL_TRACE_OFFLOAD_DEBUG("Command UNKNOWN " << cmd);
                 VL_FATAL_MT(__FILE__, __LINE__, "", "Unknown trace command");
                 break;
             }  // LCOV_EXCL_STOP
@@ -217,7 +231,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
     } while (VL_LIKELY(!shutdown));
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::shutdownOffloadWorker() {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::shutdownOffloadWorker() {
     // If the worker thread is not running, done..
     if (!m_workerThread) return;
 
@@ -237,7 +251,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::shutdownOffloadWorker() {
 //=============================================================================
 // Life cycle
 
-template <> void VerilatedTrace<VL_DERIVED_T>::closeBase() {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::closeBase() {
 #ifdef VL_TRACE_OFFLOAD
     shutdownOffloadWorker();
     while (m_numOffloadBuffers) {
@@ -247,7 +261,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::closeBase() {
 #endif
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::flushBase() {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::flushBase() {
 #ifdef VL_TRACE_OFFLOAD
     // Hand an empty buffer to the worker thread
     uint32_t* const bufferp = getOffloadBuffer();
@@ -262,29 +276,29 @@ template <> void VerilatedTrace<VL_DERIVED_T>::flushBase() {
 //=============================================================================
 // Callbacks to run on global events
 
-template <> void VerilatedTrace<VL_DERIVED_T>::onFlush(void* selfp) {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::onFlush(void* selfp) {
     // This calls 'flush' on the derived class (which must then get any mutex)
-    reinterpret_cast<VL_DERIVED_T*>(selfp)->flush();
+    reinterpret_cast<VL_SUB_T*>(selfp)->flush();
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::onExit(void* selfp) {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::onExit(void* selfp) {
     // This calls 'close' on the derived class (which must then get any mutex)
-    reinterpret_cast<VL_DERIVED_T*>(selfp)->close();
+    reinterpret_cast<VL_SUB_T*>(selfp)->close();
 }
 
 //=============================================================================
 // VerilatedTrace
 
-template <> VerilatedTrace<VL_DERIVED_T>::VerilatedTrace() {
+template <> VerilatedTrace<VL_SUB_T, VL_BUF_T>::VerilatedTrace() {
     set_time_unit(Verilated::threadContextp()->timeunitString());
     set_time_resolution(Verilated::threadContextp()->timeprecisionString());
 }
 
-template <> VerilatedTrace<VL_DERIVED_T>::~VerilatedTrace() {
+template <> VerilatedTrace<VL_SUB_T, VL_BUF_T>::~VerilatedTrace() {
     if (m_sigs_oldvalp) VL_DO_CLEAR(delete[] m_sigs_oldvalp, m_sigs_oldvalp = nullptr);
     if (m_sigs_enabledp) VL_DO_CLEAR(delete[] m_sigs_enabledp, m_sigs_enabledp = nullptr);
-    Verilated::removeFlushCb(VerilatedTrace<VL_DERIVED_T>::onFlush, this);
-    Verilated::removeExitCb(VerilatedTrace<VL_DERIVED_T>::onExit, this);
+    Verilated::removeFlushCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onFlush, this);
+    Verilated::removeExitCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onExit, this);
 #ifdef VL_TRACE_OFFLOAD
     closeBase();
 #endif
@@ -293,7 +307,7 @@ template <> VerilatedTrace<VL_DERIVED_T>::~VerilatedTrace() {
 //=========================================================================
 // Internals available to format specific implementations
 
-template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::traceInit() VL_MT_UNSAFE {
     // Note: It is possible to re-open a trace file (VCD in particular),
     // so we must reset the next code here, but it must have the same number
     // of codes on re-open
@@ -338,8 +352,8 @@ template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
     }
 
     // Set callback so flush/abort will flush this file
-    Verilated::addFlushCb(VerilatedTrace<VL_DERIVED_T>::onFlush, this);
-    Verilated::addExitCb(VerilatedTrace<VL_DERIVED_T>::onExit, this);
+    Verilated::addFlushCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onFlush, this);
+    Verilated::addExitCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onExit, this);
 
 #ifdef VL_TRACE_OFFLOAD
     // Compute offload buffer size. we need to be able to store a new value for
@@ -351,13 +365,13 @@ template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
 
     // Start the worker thread
     m_workerThread.reset(
-        new std::thread{&VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain, this});
+        new std::thread{&VerilatedTrace<VL_SUB_T, VL_BUF_T>::offloadWorkerThreadMain, this});
 #endif
 }
 
 template <>
-bool VerilatedTrace<VL_DERIVED_T>::declCode(uint32_t code, const char* namep, uint32_t bits,
-                                            bool tri) {
+bool VerilatedTrace<VL_SUB_T, VL_BUF_T>::declCode(uint32_t code, const char* namep, uint32_t bits,
+                                                  bool tri) {
     if (VL_UNCOVERABLE(!code)) {
         VL_FATAL_MT(__FILE__, __LINE__, "", "Internal: internal trace problem, code 0 is illegal");
     }
@@ -401,28 +415,30 @@ bool VerilatedTrace<VL_DERIVED_T>::declCode(uint32_t code, const char* namep, ui
 //=========================================================================
 // Internals available to format specific implementations
 
-template <> std::string VerilatedTrace<VL_DERIVED_T>::timeResStr() const {
+template <> std::string VerilatedTrace<VL_SUB_T, VL_BUF_T>::timeResStr() const {
     return doubleToTimescale(m_timeRes);
 }
 
 //=========================================================================
 // External interface to client code
 
-template <> void VerilatedTrace<VL_DERIVED_T>::set_time_unit(const char* unitp) VL_MT_SAFE {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_unit(const char* unitp) VL_MT_SAFE {
     m_timeUnit = timescaleToDouble(unitp);
 }
-template <> void VerilatedTrace<VL_DERIVED_T>::set_time_unit(const std::string& unit) VL_MT_SAFE {
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_unit(const std::string& unit) VL_MT_SAFE {
     set_time_unit(unit.c_str());
 }
-template <> void VerilatedTrace<VL_DERIVED_T>::set_time_resolution(const char* unitp) VL_MT_SAFE {
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_resolution(const char* unitp) VL_MT_SAFE {
     m_timeRes = timescaleToDouble(unitp);
 }
 template <>
-void VerilatedTrace<VL_DERIVED_T>::set_time_resolution(const std::string& unit) VL_MT_SAFE {
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_resolution(const std::string& unit) VL_MT_SAFE {
     set_time_resolution(unit.c_str());
 }
 template <>
-void VerilatedTrace<VL_DERIVED_T>::dumpvars(int level, const std::string& hier) VL_MT_SAFE {
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::dumpvars(int level, const std::string& hier) VL_MT_SAFE {
     if (level == 0) {
         m_dumpvars.clear();  // empty = everything on
     } else {
@@ -435,7 +451,87 @@ void VerilatedTrace<VL_DERIVED_T>::dumpvars(int level, const std::string& hier)
     }
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex) {
+#ifdef VL_TRACE_PARALLEL
+template <>  //
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::parallelWorkerTask(void* datap, bool) {
+    ParallelWorkerData* const wdp = reinterpret_cast<ParallelWorkerData*>(datap);
+    // Run the work item's callback with its user pointer and trace buffer
+    wdp->m_cb(wdp->m_userp, wdp->m_bufp);
+    // Mark buffer as ready (under the item's mutex) and wake the waiter if it is blocked
+    const VerilatedLockGuard lock{wdp->m_mutex};
+    wdp->m_ready.store(true);
+    if (wdp->m_waiting) wdp->m_cv.notify_one();
+}
+
+template <> VL_ATTR_NOINLINE void VerilatedTrace<VL_SUB_T, VL_BUF_T>::ParallelWorkerData::wait() {
+    // Spin for a while; acquire pairs with the worker's m_ready store to publish its writes
+    for (int i = 0; i < VL_LOCK_SPINS; ++i) {
+        if (VL_LIKELY(m_ready.load(std::memory_order_acquire))) return;
+        VL_CPU_RELAX();
+    }
+    // Still not ready, so block on the condition variable (relaxed is fine, m_mutex orders it)
+    VerilatedLockGuard lock{m_mutex};
+    m_waiting = true;
+    m_cv.wait(lock, [this] { return m_ready.load(std::memory_order_relaxed); });
+    m_waiting = false;
+}
+#endif
+
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::runParallelCallbacks(const ParallelCallbackMap& cbMap) {
+    for (VlThreadPool* threadPoolp : m_threadPoolps) {
+#ifdef VL_TRACE_PARALLEL
+        // If tracing in parallel, dispatch to the thread pool (if exists)
+        if (threadPoolp && threadPoolp->numThreads()) {
+            // List of work items for thread (std::list, as ParallelWorkerData is not movable)
+            std::list<ParallelWorkerData> workerData;
+            // We use the whole pool + the main thread
+            const unsigned threads = threadPoolp->numThreads() + 1;
+            // Main thread executes all jobs with index % threads == 0
+            std::vector<ParallelWorkerData*> mainThreadWorkerData;
+            // The tracing callbacks to execute on this thread-pool
+            const auto& cbVec = cbMap.at(threadPoolp);
+            // Enqueue all the jobs
+            for (unsigned i = 0; i < cbVec.size(); ++i) {
+                const CallbackRecord& cbr = cbVec[i];
+                // Always get the trace buffer on the main thread
+                Buffer* const bufp = getTraceBuffer();
+                // Create new work item
+                workerData.emplace_back(cbr.m_dumpCb, cbr.m_userp, bufp);
+                // Grab the new work item
+                ParallelWorkerData* const itemp = &workerData.back();
+                // Enqueue task to thread pool (worker 'rem - 1'), or keep it for the main thread
+                if (unsigned rem = i % threads) {
+                    threadPoolp->workerp(rem - 1)->addTask(parallelWorkerTask, itemp, false);
+                } else {
+                    mainThreadWorkerData.push_back(itemp);
+                }
+            }
+            // Execute main thread jobs
+            for (ParallelWorkerData* const itemp : mainThreadWorkerData) {
+                parallelWorkerTask(itemp, false);
+            }
+            // Commit all trace buffers in the order the work items were created
+            for (ParallelWorkerData& item : workerData) {
+                // Wait until ready
+                item.wait();
+                // Commit the buffer
+                commitTraceBuffer(item.m_bufp);
+            }
+            continue;
+        }
+#endif
+        // Fall back on sequential execution: get, fill and commit one buffer per callback
+        for (const CallbackRecord& cbr : cbMap.at(threadPoolp)) {
+            Buffer* const traceBufferp = getTraceBuffer();
+            cbr.m_dumpCb(cbr.m_userp, traceBufferp);
+            commitTraceBuffer(traceBufferp);
+        }
+    }
+}
+
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex) {
     // Not really VL_MT_SAFE but more VL_MT_UNSAFE_ONE.
     // This does get the mutex, but if multiple threads are trying to dump
     // chances are the data being dumped will have other problems
@@ -483,20 +579,14 @@ template <> void VerilatedTrace<VL_DERIVED_T>::dump(uint64_t timeui) VL_MT_SAFE_
     // Run the callbacks
     if (VL_UNLIKELY(m_fullDump)) {
         m_fullDump = false;  // No more need for next dump to be full
-        for (uint32_t i = 0; i < m_fullCbs.size(); ++i) {
-            const CallbackRecord& cbr = m_fullCbs[i];
-            cbr.m_dumpCb(cbr.m_userp, self());
-        }
+        runParallelCallbacks(m_fullCbs);
     } else {
-        for (uint32_t i = 0; i < m_chgCbs.size(); ++i) {
-            const CallbackRecord& cbr = m_chgCbs[i];
-            cbr.m_dumpCb(cbr.m_userp, self());
-        }
+        runParallelCallbacks(m_chgCbs);
     }
 
     for (uint32_t i = 0; i < m_cleanupCbs.size(); ++i) {
         const CallbackRecord& cbr = m_cleanupCbs[i];
-        cbr.m_dumpCb(cbr.m_userp, self());
+        cbr.m_cleanupCb(cbr.m_userp, self());
     }
 
 #ifdef VL_TRACE_OFFLOAD
@@ -517,8 +607,18 @@ template <> void VerilatedTrace<VL_DERIVED_T>::dump(uint64_t timeui) VL_MT_SAFE_
 // Non-hot path internal interface to Verilator generated code
 
 template <>
-void VerilatedTrace<VL_DERIVED_T>::addCallbackRecord(std::vector<CallbackRecord>& cbVec,
-                                                     CallbackRecord& cbRec)
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addThreadPool(VlThreadPool* threadPoolp)
+    VL_MT_SAFE_EXCLUDES(m_mutex) {
+    const VerilatedLockGuard lock{m_mutex};
+    for (VlThreadPool* const poolp : m_threadPoolps) {
+        if (poolp == threadPoolp) return;
+    }
+    m_threadPoolps.push_back(threadPoolp);
+}
+
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addCallbackRecord(std::vector<CallbackRecord>& cbVec,
+                                                           CallbackRecord& cbRec)
     VL_MT_SAFE_EXCLUDES(m_mutex) {
     const VerilatedLockGuard lock{m_mutex};
     if (VL_UNCOVERABLE(timeLastDump() != 0)) {  // LCOV_EXCL_START
@@ -529,91 +629,40 @@ void VerilatedTrace<VL_DERIVED_T>::addCallbackRecord(std::vector<CallbackRecord>
     cbVec.push_back(cbRec);
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::addInitCb(initCb_t cb, void* userp) VL_MT_SAFE {
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addInitCb(initCb_t cb, void* userp) VL_MT_SAFE {
     CallbackRecord cbr{cb, userp};
     addCallbackRecord(m_initCbs, cbr);
 }
-template <> void VerilatedTrace<VL_DERIVED_T>::addFullCb(dumpCb_t cb, void* userp) VL_MT_SAFE {
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addFullCb(dumpCb_t cb, void* userp,
+                                                   VlThreadPool* threadPoolp) VL_MT_SAFE {
     CallbackRecord cbr{cb, userp};
-    addCallbackRecord(m_fullCbs, cbr);
+    addThreadPool(threadPoolp);
+    addCallbackRecord(m_fullCbs[threadPoolp], cbr);
 }
-template <> void VerilatedTrace<VL_DERIVED_T>::addChgCb(dumpCb_t cb, void* userp) VL_MT_SAFE {
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addChgCb(dumpCb_t cb, void* userp,
+                                                  VlThreadPool* threadPoolp) VL_MT_SAFE {
     CallbackRecord cbr{cb, userp};
-    addCallbackRecord(m_chgCbs, cbr);
+    addThreadPool(threadPoolp);
+    addCallbackRecord(m_chgCbs[threadPoolp], cbr);
 }
-template <> void VerilatedTrace<VL_DERIVED_T>::addCleanupCb(dumpCb_t cb, void* userp) VL_MT_SAFE {
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE {
     CallbackRecord cbr{cb, userp};
     addCallbackRecord(m_cleanupCbs, cbr);
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::pushNamePrefix(const std::string& prefix) {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::pushNamePrefix(const std::string& prefix) {
     m_namePrefixStack.push_back(m_namePrefixStack.back() + prefix);
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::popNamePrefix(unsigned count) {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::popNamePrefix(unsigned count) {
     while (count--) m_namePrefixStack.pop_back();
     assert(!m_namePrefixStack.empty());
 }
 
-//=========================================================================
-// Hot path internal interface to Verilator generated code
-
-// These functions must write the new value back into the old value store,
-// and subsequently call the format specific emit* implementations. Note
-// that this file must be included in the format specific implementation, so
-// the emit* functions can be inlined for performance.
-
-template <> void VerilatedTrace<VL_DERIVED_T>::fullBit(uint32_t* oldp, CData newval) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    self()->emitBit(code, newval);
-}
-
-template <> void VerilatedTrace<VL_DERIVED_T>::fullCData(uint32_t* oldp, CData newval, int bits) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    self()->emitCData(code, newval, bits);
-}
-
-template <> void VerilatedTrace<VL_DERIVED_T>::fullSData(uint32_t* oldp, SData newval, int bits) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    self()->emitSData(code, newval, bits);
-}
-
-template <> void VerilatedTrace<VL_DERIVED_T>::fullIData(uint32_t* oldp, IData newval, int bits) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    self()->emitIData(code, newval, bits);
-}
-
-template <> void VerilatedTrace<VL_DERIVED_T>::fullQData(uint32_t* oldp, QData newval, int bits) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    *reinterpret_cast<QData*>(oldp) = newval;
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    self()->emitQData(code, newval, bits);
-}
-
-template <>
-void VerilatedTrace<VL_DERIVED_T>::fullWData(uint32_t* oldp, const WData* newvalp, int bits) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i];
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    self()->emitWData(code, newvalp, bits);
-}
-
-template <> void VerilatedTrace<VL_DERIVED_T>::fullDouble(uint32_t* oldp, double newval) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    *reinterpret_cast<double*>(oldp) = newval;
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    // cppcheck-suppress invalidPointerCast
-    self()->emitDouble(code, newval);
-}
-
 //=========================================================================
 // Primitives converting binary values to strings...
 
@@ -704,4 +753,86 @@ static inline void cvtQDataToStr(char* dstp, QData value) {
 
 #define cvtEDataToStr cvtIDataToStr
 
+//=========================================================================
+// VerilatedTraceBuffer
+
+template <>  //
+VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::VerilatedTraceBuffer(VL_SUB_T& owner)
+    : m_owner{owner} {
+#ifdef VL_TRACE_OFFLOAD
+    if (m_offloadBufferWritep) {
+        using This = VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>*;
+        // Tack on the buffer address: a TRACE_BUFFER command word, then 'this' in two words
+        static_assert(2 * sizeof(uint32_t) >= sizeof(This),
+                      "This should be enough on all plafrorms");
+        *m_offloadBufferWritep++ = VerilatedTraceOffloadCommand::TRACE_BUFFER;
+        *reinterpret_cast<This*>(m_offloadBufferWritep) = this;
+        m_offloadBufferWritep += 2;
+    }
+#endif
+}
+
+// These functions must write the new value back into the old value store,
+// and subsequently call the format specific emit* implementations. Note
+// that this file must be included in the format specific implementation, so
+// the emit* functions can be inlined for performance.
+
+template <>  //
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullBit(uint32_t* oldp, CData newval) {
+    const uint32_t code = oldp - m_sigs_oldvalp;  // Signal code: offset into old value store
+    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    self()->emitBit(code, newval);
+}
+
+template <>
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullCData(uint32_t* oldp, CData newval, int bits) {
+    const uint32_t code = oldp - m_sigs_oldvalp;  // Signal code: offset into old value store
+    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    self()->emitCData(code, newval, bits);
+}
+
+template <>
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullSData(uint32_t* oldp, SData newval, int bits) {
+    const uint32_t code = oldp - m_sigs_oldvalp;  // Signal code: offset into old value store
+    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    self()->emitSData(code, newval, bits);
+}
+
+template <>
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullIData(uint32_t* oldp, IData newval, int bits) {
+    const uint32_t code = oldp - m_sigs_oldvalp;  // Signal code: offset into old value store
+    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    self()->emitIData(code, newval, bits);
+}
+
+template <>
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullQData(uint32_t* oldp, QData newval, int bits) {
+    const uint32_t code = oldp - m_sigs_oldvalp;  // Signal code: offset into old value store
+    *reinterpret_cast<QData*>(oldp) = newval;  // Always update the old value, even if disabled
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    self()->emitQData(code, newval, bits);
+}
+
+template <>
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullWData(uint32_t* oldp, const WData* newvalp,
+                                                         int bits) {
+    const uint32_t code = oldp - m_sigs_oldvalp;  // Signal code: offset into old value store
+    for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i];  // Always update old value
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    self()->emitWData(code, newvalp, bits);
+}
+
+template <>
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullDouble(uint32_t* oldp, double newval) {
+    const uint32_t code = oldp - m_sigs_oldvalp;  // Signal code: offset into old value store
+    *reinterpret_cast<double*>(oldp) = newval;  // Always update the old value, even if disabled
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    // cppcheck-suppress invalidPointerCast
+    self()->emitDouble(code, newval);
+}
+
 #endif  // VL_CPPCHECK
diff --git a/include/verilated_vcd_c.cpp b/include/verilated_vcd_c.cpp
index 8e0008e3f..9db71aabc 100644
--- a/include/verilated_vcd_c.cpp
+++ b/include/verilated_vcd_c.cpp
@@ -62,12 +62,23 @@ constexpr unsigned VL_TRACE_MAX_VCD_CODE_SIZE = 5;  // Maximum length of a VCD s
 // cache-lines.
 constexpr unsigned VL_TRACE_SUFFIX_ENTRY_SIZE = 8;  // Size of a suffix entry
 
+//=============================================================================
+// Utility functions: TODO: put these in a common place and share them.
+
+template <size_t N> static size_t roundUpToMultipleOf(size_t value) {
+    static_assert(N != 0 && (N & (N - 1)) == 0, "'N' must be a power of 2");
+    const size_t mask = N - 1;  // Low bits to clear; N == 0 is rejected, it passes the bit test
+    return (value + mask) & ~mask;  // Smallest multiple of N that is >= value
+}
+
 //=============================================================================
 // Specialization of the generics for this trace format
 
-#define VL_DERIVED_T VerilatedVcd
+#define VL_SUB_T VerilatedVcd
+#define VL_BUF_T VerilatedVcdBuffer
 #include "verilated_trace_imp.h"
-#undef VL_DERIVED_T
+#undef VL_SUB_T
+#undef VL_BUF_T
 
 //=============================================================================
 //=============================================================================
@@ -183,7 +194,7 @@ void VerilatedVcd::makeNameMap() {
     deleteNameMap();
     m_namemapp = new NameMap;
 
-    VerilatedTrace<VerilatedVcd>::traceInit();
+    Super::traceInit();
 
     // Though not speced, it's illegal to generate a vcd with signals
     // not under any module - it crashes at least two viewers.
@@ -218,13 +229,17 @@ VerilatedVcd::~VerilatedVcd() {
     if (m_wrBufp) VL_DO_CLEAR(delete[] m_wrBufp, m_wrBufp = nullptr);
     deleteNameMap();
     if (m_filep && m_fileNewed) VL_DO_CLEAR(delete m_filep, m_filep = nullptr);
+#ifdef VL_TRACE_PARALLEL
+    assert(m_numBuffers == m_freeBuffers.size());
+    for (auto& pair : m_freeBuffers) VL_DO_CLEAR(delete[] pair.first, pair.first = nullptr);
+#endif
 }
 
 void VerilatedVcd::closePrev() {
     // This function is on the flush() call path
     if (!isOpen()) return;
 
-    VerilatedTrace<VerilatedVcd>::flushBase();
+    Super::flushBase();
     bufferFlush();
     m_isOpen = false;
     m_filep->close();
@@ -251,14 +266,14 @@ void VerilatedVcd::close() VL_MT_SAFE_EXCLUDES(m_mutex) {
         printStr(" $end\n");
     }
     closePrev();
-    // closePrev() called VerilatedTrace<VerilatedVcd>::flush(), so we just
+    // closePrev() called Super::flushBase(), so we just
     // need to shut down the tracing thread here.
-    VerilatedTrace<VerilatedVcd>::closeBase();
+    Super::closeBase();
 }
 
 void VerilatedVcd::flush() VL_MT_SAFE_EXCLUDES(m_mutex) {
     const VerilatedLockGuard lock{m_mutex};
-    VerilatedTrace<VerilatedVcd>::flushBase();
+    Super::flushBase();
     bufferFlush();
 }
 
@@ -277,12 +292,12 @@ void VerilatedVcd::printQuad(uint64_t n) {
     printStr(buf);
 }
 
-void VerilatedVcd::bufferResize(uint64_t minsize) {
+void VerilatedVcd::bufferResize(size_t minsize) {
     // minsize is size of largest write.  We buffer at least 8 times as much data,
     // writing when we are 3/4 full (with thus 2*minsize remaining free)
     if (VL_UNLIKELY(minsize > m_wrChunkSize)) {
         const char* oldbufp = m_wrBufp;
-        m_wrChunkSize = minsize * 2;
+        m_wrChunkSize = roundUpToMultipleOf<1024>(minsize * 2);
         m_wrBufp = new char[m_wrChunkSize * 8];
         std::memcpy(m_wrBufp, oldbufp, m_writep - oldbufp);
         m_writep = m_wrBufp + (m_writep - oldbufp);
@@ -463,14 +478,16 @@ void VerilatedVcd::declare(uint32_t code, const char* name, const char* wirep, b
                            int arraynum, bool tri, bool bussed, int msb, int lsb) {
     const int bits = ((msb > lsb) ? (msb - lsb) : (lsb - msb)) + 1;
 
-    const bool enabled = VerilatedTrace<VerilatedVcd>::declCode(code, name, bits, tri);
+    const bool enabled = Super::declCode(code, name, bits, tri);
 
     if (m_suffixes.size() <= nextCode() * VL_TRACE_SUFFIX_ENTRY_SIZE) {
         m_suffixes.resize(nextCode() * VL_TRACE_SUFFIX_ENTRY_SIZE * 2, 0);
     }
 
-    // Make sure write buffer is large enough (one character per bit), plus header
-    bufferResize(bits + 1024);
+    // Keep upper bound on bytes a single signal can emit into the buffer
+    m_maxSignalBytes = std::max<size_t>(m_maxSignalBytes, bits + 32);
+    // Make sure write buffer is large enough, plus header
+    bufferResize(m_maxSignalBytes + 1024);
 
     if (!enabled) return;
 
@@ -564,7 +581,71 @@ void VerilatedVcd::declDouble(uint32_t code, const char* name, bool array, int a
 }
 
 //=============================================================================
-// Trace rendering prinitives
+// Get/commit trace buffer
+
+VerilatedVcdBuffer* VerilatedVcd::getTraceBuffer() {
+#ifdef VL_TRACE_PARALLEL
+    // Note: This is called from VerilatedVcd::dump, which already holds the lock
+    // If no buffer available, allocate a new one
+    if (m_freeBuffers.empty()) {
+        constexpr size_t pageSize = 4096;
+        // 4 * m_maxSignalBytes, so we can reserve 2 * m_maxSignalBytes at the end for safety
+        size_t startingSize = roundUpToMultipleOf<pageSize>(4 * m_maxSignalBytes);
+        m_freeBuffers.emplace_back(new char[startingSize], startingSize);
+        ++m_numBuffers;
+    }
+    // Grab a buffer (pointer and size pair) off the back of the free list
+    const auto pair = m_freeBuffers.back();
+    m_freeBuffers.pop_back();
+    // Return a new trace buffer object wrapping it; commitTraceBuffer deletes the object
+    return new VerilatedVcdBuffer{*this, pair.first, pair.second};
+#else
+    return new VerilatedVcdBuffer{*this};
+#endif
+}
+
+void VerilatedVcd::commitTraceBuffer(VerilatedVcdBuffer* bufp) {
+#ifdef VL_TRACE_PARALLEL
+    // Note: This is called from VerilatedVcd::dump, which already holds the lock
+    // Resize output buffer. Note, we use the full size of the trace buffer, as
+    // this is a lot more stable than the actual occupancy of the trace buffer.
+    // This helps us to avoid re-allocations due to small size changes.
+    bufferResize(bufp->m_size);
+    // Compute occupancy of buffer
+    const size_t usedSize = bufp->m_writep - bufp->m_bufp;
+    // Copy to output buffer
+    std::memcpy(m_writep, bufp->m_bufp, usedSize);
+    // Adjust write pointer
+    m_writep += usedSize;
+    // Flush if necessary
+    bufferCheck();
+    // Put buffer back on free list (with its current, possibly grown, size)
+    m_freeBuffers.emplace_back(bufp->m_bufp, bufp->m_size);
+#else
+    // Needs adjusting for emitTimeChange
+    m_writep = bufp->m_writep;
+#endif
+    delete bufp;
+}
+
+//=============================================================================
+// VerilatedVcdBuffer implementation
+
+#ifdef VL_TRACE_PARALLEL
+VerilatedVcdBuffer::VerilatedVcdBuffer(VerilatedVcd& owner, char* bufp, size_t size)
+    : VerilatedTraceBuffer<VerilatedVcd, VerilatedVcdBuffer>{owner}
+    , m_writep{bufp}
+    , m_bufp{bufp}
+    , m_size{size} {
+    adjustGrowp();  // NOTE(review): presumably sets the resize limit (m_growp) - confirm
+}
+#else  // !VL_TRACE_PARALLEL
+VerilatedVcdBuffer::VerilatedVcdBuffer(VerilatedVcd& owner)
+    : VerilatedTraceBuffer<VerilatedVcd, VerilatedVcdBuffer>{owner} {}
+#endif  // VL_TRACE_PARALLEL
+
+//=============================================================================
+// Trace rendering primitives
 
 static inline void
 VerilatedVcdCCopyAndAppendNewLine(char* writep, const char* suffixp) VL_ATTR_NO_SANITIZE_ALIGN;
@@ -589,15 +670,44 @@ static inline void VerilatedVcdCCopyAndAppendNewLine(char* writep, const char* s
 #endif
 }
 
-void VerilatedVcd::finishLine(uint32_t code, char* writep) {
-    const char* const suffixp = m_suffixes.data() + code * VL_TRACE_SUFFIX_ENTRY_SIZE;
+void VerilatedVcdBuffer::finishLine(uint32_t code, char* writep) {
+    const char* const suffixp = m_suffixes + code * VL_TRACE_SUFFIX_ENTRY_SIZE;
     VL_DEBUG_IFDEF(assert(suffixp[0]););
     VerilatedVcdCCopyAndAppendNewLine(writep, suffixp);
 
     // Now write back the write pointer incremented by the actual size of the
     // suffix, which was stored in the last byte of the suffix buffer entry.
     m_writep = writep + suffixp[VL_TRACE_SUFFIX_ENTRY_SIZE - 1];
-    bufferCheck();
+
+#ifdef VL_TRACE_PARALLEL
+    // Double the size of the buffer if necessary
+    if (VL_UNLIKELY(m_writep >= m_growp)) {
+        // Compute occupied size of current buffer
+        const size_t usedSize = m_writep - m_bufp;
+        // We are always doubling the size
+        m_size *= 2;
+        // Allocate the new buffer
+        char* const newBufp = new char[m_size];
+        // Copy from current buffer to new buffer
+        std::memcpy(newBufp, m_bufp, usedSize);
+        // Delete current buffer
+        delete[] m_bufp;
+        // Make new buffer the current buffer
+        m_bufp = newBufp;
+        // Adjust write pointer
+        m_writep = m_bufp + usedSize;
+        // Adjust resize limit
+        adjustGrowp();
+    }
+#else
+    // Flush the write buffer if there's not enough space left for new information
+    // We only call this once per vector, so we need enough slop for a very wide "b###" line
+    if (VL_UNLIKELY(m_writep > m_wrFlushp)) {
+        m_owner.m_writep = m_writep;
+        m_owner.bufferFlush();
+        m_writep = m_owner.m_writep;
+    }
+#endif
 }
 
 //=============================================================================
@@ -608,7 +718,7 @@ void VerilatedVcd::finishLine(uint32_t code, char* writep) {
 // so always inline them.
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitBit(uint32_t code, CData newval) {
+void VerilatedVcdBuffer::emitBit(uint32_t code, CData newval) {
     // Don't prefetch suffix as it's a bit too late;
     char* wp = m_writep;
     *wp++ = '0' | static_cast<char>(newval);
@@ -616,7 +726,7 @@ void VerilatedVcd::emitBit(uint32_t code, CData newval) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitCData(uint32_t code, CData newval, int bits) {
+void VerilatedVcdBuffer::emitCData(uint32_t code, CData newval, int bits) {
     char* wp = m_writep;
     *wp++ = 'b';
     cvtCDataToStr(wp, newval << (VL_BYTESIZE - bits));
@@ -624,7 +734,7 @@ void VerilatedVcd::emitCData(uint32_t code, CData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitSData(uint32_t code, SData newval, int bits) {
+void VerilatedVcdBuffer::emitSData(uint32_t code, SData newval, int bits) {
     char* wp = m_writep;
     *wp++ = 'b';
     cvtSDataToStr(wp, newval << (VL_SHORTSIZE - bits));
@@ -632,7 +742,7 @@ void VerilatedVcd::emitSData(uint32_t code, SData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitIData(uint32_t code, IData newval, int bits) {
+void VerilatedVcdBuffer::emitIData(uint32_t code, IData newval, int bits) {
     char* wp = m_writep;
     *wp++ = 'b';
     cvtIDataToStr(wp, newval << (VL_IDATASIZE - bits));
@@ -640,7 +750,7 @@ void VerilatedVcd::emitIData(uint32_t code, IData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitQData(uint32_t code, QData newval, int bits) {
+void VerilatedVcdBuffer::emitQData(uint32_t code, QData newval, int bits) {
     char* wp = m_writep;
     *wp++ = 'b';
     cvtQDataToStr(wp, newval << (VL_QUADSIZE - bits));
@@ -648,7 +758,7 @@ void VerilatedVcd::emitQData(uint32_t code, QData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitWData(uint32_t code, const WData* newvalp, int bits) {
+void VerilatedVcdBuffer::emitWData(uint32_t code, const WData* newvalp, int bits) {
     int words = VL_WORDS_I(bits);
     char* wp = m_writep;
     *wp++ = 'b';
@@ -665,10 +775,10 @@ void VerilatedVcd::emitWData(uint32_t code, const WData* newvalp, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitDouble(uint32_t code, double newval) {
+void VerilatedVcdBuffer::emitDouble(uint32_t code, double newval) {
     char* wp = m_writep;
     // Buffer can't overflow before VL_SNPRINTF; we sized during declaration
-    VL_SNPRINTF(wp, m_wrChunkSize, "r%.16g", newval);
+    VL_SNPRINTF(wp, m_maxSignalBytes, "r%.16g", newval);
     wp += std::strlen(wp);
     finishLine(code, wp);
 }
diff --git a/include/verilated_vcd_c.h b/include/verilated_vcd_c.h
index b1485e13b..0d83eb25d 100644
--- a/include/verilated_vcd_c.h
+++ b/include/verilated_vcd_c.h
@@ -28,39 +28,20 @@
 #include <string>
 #include <vector>
 
-class VerilatedVcd;
-
-//=============================================================================
-// VerilatedFile
-/// Class representing a file to write to. These virtual methods can be
-/// overrode for e.g. socket I/O.
-
-class VerilatedVcdFile VL_NOT_FINAL {
-private:
-    int m_fd = 0;  // File descriptor we're writing to
-public:
-    // METHODS
-    /// Construct a (as yet) closed file
-    VerilatedVcdFile() = default;
-    /// Close and destruct
-    virtual ~VerilatedVcdFile() = default;
-    /// Open a file with given filename
-    virtual bool open(const std::string& name) VL_MT_UNSAFE;
-    /// Close object's file
-    virtual void close() VL_MT_UNSAFE;
-    /// Write data to file (if it is open)
-    virtual ssize_t write(const char* bufp, ssize_t len) VL_MT_UNSAFE;
-};
+class VerilatedVcdBuffer;
+class VerilatedVcdFile;
 
 //=============================================================================
 // VerilatedVcd
 // Base class to create a Verilator VCD dump
 // This is an internally used class - see VerilatedVcdC for what to call from applications
 
-class VerilatedVcd VL_NOT_FINAL : public VerilatedTrace<VerilatedVcd> {
+class VerilatedVcd VL_NOT_FINAL : public VerilatedTrace<VerilatedVcd, VerilatedVcdBuffer> {
+public:
+    using Super = VerilatedTrace<VerilatedVcd, VerilatedVcdBuffer>;
+
 private:
-    // Give the superclass access to private bits (to avoid virtual functions)
-    friend class VerilatedTrace<VerilatedVcd>;
+    friend Buffer;  // Give the buffer access to the private bits
 
     //=========================================================================
     // VCD specific internals
@@ -74,9 +55,10 @@ private:
     int m_modDepth = 0;  // Depth of module hierarchy
 
     char* m_wrBufp;  // Output buffer
-    const char* m_wrFlushp;  // Output buffer flush trigger location
+    char* m_wrFlushp;  // Output buffer flush trigger location
     char* m_writep;  // Write pointer into output buffer
-    uint64_t m_wrChunkSize;  // Output buffer size
+    size_t m_wrChunkSize;  // Output buffer size
+    size_t m_maxSignalBytes = 0;  // Upper bound on number of bytes a single signal can generate
     uint64_t m_wroteBytes = 0;  // Number of bytes written to this file
 
     std::vector<char> m_suffixes;  // VCD line end string codes + metadata
@@ -84,7 +66,13 @@ private:
     using NameMap = std::map<const std::string, const std::string>;
     NameMap* m_namemapp = nullptr;  // List of names for the header
 
-    void bufferResize(uint64_t minsize);
+#ifdef VL_TRACE_PARALLEL
+    // Vector of free trace buffers as (pointer, size) pairs.
+    std::vector<std::pair<char*, size_t>> m_freeBuffers;
+    size_t m_numBuffers = 0;  // Number of trace buffers allocated
+#endif
+
+    void bufferResize(size_t minsize);
     void bufferFlush() VL_MT_UNSAFE_ONE;
     inline void bufferCheck() {
         // Flush the write buffer if there's not enough space left for new information
@@ -107,8 +95,6 @@ private:
 
     static char* writeCode(char* writep, uint32_t code);
 
-    void finishLine(uint32_t code, char* writep);
-
     // CONSTRUCTORS
     VL_UNCOPYABLE(VerilatedVcd);
 
@@ -116,27 +102,22 @@ protected:
     //=========================================================================
     // Implementation of VerilatedTrace interface
 
-    // Implementations of protected virtual methods for VerilatedTrace
+    // Called when the trace moves forward to a new time point
     virtual void emitTimeChange(uint64_t timeui) override;
 
     // Hooks called from VerilatedTrace
     virtual bool preFullDump() override { return isOpen(); }
     virtual bool preChangeDump() override;
 
-    // Implementations of duck-typed methods for VerilatedTrace. These are
-    // called from only one place (namely full*) so always inline them.
-    inline void emitBit(uint32_t code, CData newval);
-    inline void emitCData(uint32_t code, CData newval, int bits);
-    inline void emitSData(uint32_t code, SData newval, int bits);
-    inline void emitIData(uint32_t code, IData newval, int bits);
-    inline void emitQData(uint32_t code, QData newval, int bits);
-    inline void emitWData(uint32_t code, const WData* newvalp, int bits);
-    inline void emitDouble(uint32_t code, double newval);
+    // Trace buffer management
+    virtual VerilatedVcdBuffer* getTraceBuffer() override;
+    virtual void commitTraceBuffer(VerilatedVcdBuffer*) override;
 
 public:
     //=========================================================================
     // External interface to client code
 
+    // CONSTRUCTOR
     explicit VerilatedVcd(VerilatedVcdFile* filep = nullptr);
     ~VerilatedVcd();
 
@@ -144,7 +125,7 @@ public:
     // Set size in megabytes after which new file should be created
     void rolloverMB(uint64_t rolloverMB) { m_rolloverMB = rolloverMB; }
 
-    // METHODS
+    // METHODS - All must be thread safe
     // Open the file; call isOpen() to see if errors
     void open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex);
     // Open next data-only file
@@ -167,15 +148,92 @@ public:
 };
 
 #ifndef DOXYGEN
-// Declare specializations here they are used in VerilatedVcdC just below
-template <> void VerilatedTrace<VerilatedVcd>::dump(uint64_t timeui);
-template <> void VerilatedTrace<VerilatedVcd>::set_time_unit(const char* unitp);
-template <> void VerilatedTrace<VerilatedVcd>::set_time_unit(const std::string& unit);
-template <> void VerilatedTrace<VerilatedVcd>::set_time_resolution(const char* unitp);
-template <> void VerilatedTrace<VerilatedVcd>::set_time_resolution(const std::string& unit);
-template <> void VerilatedTrace<VerilatedVcd>::dumpvars(int level, const std::string& hier);
+// Declare specializations here as they are used in VerilatedVcdC just below
+template <> void VerilatedVcd::Super::dump(uint64_t time);
+template <> void VerilatedVcd::Super::set_time_unit(const char* unitp);
+template <> void VerilatedVcd::Super::set_time_unit(const std::string& unit);
+template <> void VerilatedVcd::Super::set_time_resolution(const char* unitp);
+template <> void VerilatedVcd::Super::set_time_resolution(const std::string& unit);
+template <> void VerilatedVcd::Super::dumpvars(int level, const std::string& hier);
 #endif  // DOXYGEN
 
+//=============================================================================
+// VerilatedVcdBuffer
+
+class VerilatedVcdBuffer final : public VerilatedTraceBuffer<VerilatedVcd, VerilatedVcdBuffer> {
+    // Give the trace file access to the private bits
+    friend VerilatedVcd;
+    friend VerilatedVcd::Super;
+
+#ifdef VL_TRACE_PARALLEL
+    char* m_writep;  // Write pointer into m_bufp
+    char* m_bufp;  // The beginning of the trace buffer
+    size_t m_size;  // The size of the buffer at m_bufp
+    char* m_growp;  // Resize limit pointer
+#else
+    char* m_writep = m_owner.m_writep;  // Write pointer into output buffer
+    char* const m_wrFlushp = m_owner.m_wrFlushp;  // Output buffer flush trigger location
+#endif
+
+    // VCD line end string codes + metadata
+    const char* const m_suffixes = m_owner.m_suffixes.data();
+    // The maximum number of bytes a single signal can emit
+    const size_t m_maxSignalBytes = m_owner.m_maxSignalBytes;
+
+    void finishLine(uint32_t code, char* writep);
+
+#ifdef VL_TRACE_PARALLEL
+    void adjustGrowp() {
+        m_growp = (m_bufp + m_size) - (2 * m_maxSignalBytes);
+        assert(m_growp >= m_bufp + m_maxSignalBytes);
+    }
+#endif
+
+public:
+    // CONSTRUCTOR
+#ifdef VL_TRACE_PARALLEL
+    explicit VerilatedVcdBuffer(VerilatedVcd& owner, char* bufp, size_t size);
+#else
+    explicit VerilatedVcdBuffer(VerilatedVcd& owner);
+#endif
+    ~VerilatedVcdBuffer() = default;
+
+    //=========================================================================
+    // Implementation of VerilatedTraceBuffer interface
+
+    // Implementations of duck-typed methods for VerilatedTraceBuffer. These are
+    // called from only one place (the full* methods), so always inline them.
+    VL_ATTR_ALWINLINE inline void emitBit(uint32_t code, CData newval);
+    VL_ATTR_ALWINLINE inline void emitCData(uint32_t code, CData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitSData(uint32_t code, SData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitIData(uint32_t code, IData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitQData(uint32_t code, QData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitWData(uint32_t code, const WData* newvalp, int bits);
+    VL_ATTR_ALWINLINE inline void emitDouble(uint32_t code, double newval);
+};
+
+//=============================================================================
+// VerilatedVcdFile
+/// Class representing a file to write to. These virtual methods can be
+/// overridden for e.g. socket I/O.
+
+class VerilatedVcdFile VL_NOT_FINAL {
+private:
+    int m_fd = 0;  // File descriptor we're writing to
+public:
+    // METHODS
+    /// Construct a (as yet) closed file
+    VerilatedVcdFile() = default;
+    /// Close and destruct
+    virtual ~VerilatedVcdFile() = default;
+    /// Open a file with given filename
+    virtual bool open(const std::string& name) VL_MT_UNSAFE;
+    /// Close object's file
+    virtual void close() VL_MT_UNSAFE;
+    /// Write data to file (if it is open)
+    virtual ssize_t write(const char* bufp, ssize_t len) VL_MT_UNSAFE;
+};
+
 //=============================================================================
 // VerilatedVcdC
 /// Class representing a VCD dump file in C standalone (no SystemC)
diff --git a/include/verilatedos.h b/include/verilatedos.h
index 28412cac4..6bacfe27b 100644
--- a/include/verilatedos.h
+++ b/include/verilatedos.h
@@ -40,6 +40,7 @@
 #ifdef __GNUC__
 # define VL_ATTR_ALIGNED(alignment) __attribute__((aligned(alignment)))
 # define VL_ATTR_ALWINLINE __attribute__((always_inline))
+# define VL_ATTR_NOINLINE __attribute__((noinline))
 # define VL_ATTR_COLD __attribute__((cold))
 # define VL_ATTR_HOT __attribute__((hot))
 # define VL_ATTR_NORETURN __attribute__((noreturn))
@@ -82,6 +83,9 @@
 #ifndef VL_ATTR_ALWINLINE
 # define VL_ATTR_ALWINLINE  ///< Attribute to inline, even when not optimizing
 #endif
+#ifndef VL_ATTR_NOINLINE
+# define VL_ATTR_NOINLINE  ///< Attribute to never inline, even when optimizing
+#endif
 #ifndef VL_ATTR_COLD
 # define VL_ATTR_COLD  ///< Attribute that function is rarely executed
 #endif
diff --git a/src/V3EmitCImp.cpp b/src/V3EmitCImp.cpp
index 0d979b143..c88648d3f 100644
--- a/src/V3EmitCImp.cpp
+++ b/src/V3EmitCImp.cpp
@@ -751,26 +751,26 @@ class EmitCTrace final : EmitCFunc {
         const string func = nodep->full() ? "full" : "chg";
         bool emitWidth = true;
         if (nodep->dtypep()->basicp()->isDouble()) {
-            puts("tracep->" + func + "Double");
+            puts("bufp->" + func + "Double");
             emitWidth = false;
         } else if (nodep->isWide() || emitTraceIsScBv(nodep) || emitTraceIsScBigUint(nodep)) {
-            puts("tracep->" + func + "WData");
+            puts("bufp->" + func + "WData");
         } else if (nodep->isQuad()) {
-            puts("tracep->" + func + "QData");
+            puts("bufp->" + func + "QData");
         } else if (nodep->declp()->widthMin() > 16) {
-            puts("tracep->" + func + "IData");
+            puts("bufp->" + func + "IData");
         } else if (nodep->declp()->widthMin() > 8) {
-            puts("tracep->" + func + "SData");
+            puts("bufp->" + func + "SData");
         } else if (nodep->declp()->widthMin() > 1) {
-            puts("tracep->" + func + "CData");
+            puts("bufp->" + func + "CData");
         } else {
-            puts("tracep->" + func + "Bit");
+            puts("bufp->" + func + "Bit");
             emitWidth = false;
         }
 
         const uint32_t offset = (arrayindex < 0) ? 0 : (arrayindex * nodep->declp()->widthWords());
         const uint32_t code = nodep->declp()->code() + offset;
-        puts(v3Global.opt.useTraceOffloadThread() && !nodep->full() ? "(base+" : "(oldp+");
+        puts(v3Global.opt.useTraceOffload() && !nodep->full() ? "(base+" : "(oldp+");
         puts(cvtToStr(code - nodep->baseCode()));
         puts(",");
         emitTraceValue(nodep, arrayindex);
diff --git a/src/V3EmitCMake.cpp b/src/V3EmitCMake.cpp
index 67e8a741c..7df71dfeb 100644
--- a/src/V3EmitCMake.cpp
+++ b/src/V3EmitCMake.cpp
@@ -113,9 +113,8 @@ class CMakeEmitter final {
         cmake_set_raw(*of, name + "_COVERAGE", v3Global.opt.coverage() ? "1" : "0");
         *of << "# Threaded output mode?  0/1/N threads (from --threads)\n";
         cmake_set_raw(*of, name + "_THREADS", cvtToStr(v3Global.opt.threads()));
-        *of << "# Threaded tracing output mode?  0/1/N threads (from --trace-threads)\n";
-        cmake_set_raw(*of, name + "_TRACE_THREADS",
-                      cvtToStr(v3Global.opt.useTraceOffloadThread()));
+        *of << "# Threaded tracing output mode?  0/1/N threads (from --threads/--trace-threads)\n";
+        cmake_set_raw(*of, name + "_TRACE_THREADS", cvtToStr(v3Global.opt.vmTraceThreads()));
         cmake_set_raw(*of, name + "_TRACE_FST_WRITER_THREAD",
                       v3Global.opt.traceThreads() && v3Global.opt.traceFormat().fst() ? "1" : "0");
         *of << "# Struct output mode?  0/1 (from --trace-structs)\n";
diff --git a/src/V3EmitMk.cpp b/src/V3EmitMk.cpp
index 429b78d33..b748d9553 100644
--- a/src/V3EmitMk.cpp
+++ b/src/V3EmitMk.cpp
@@ -73,9 +73,10 @@ public:
         of.puts("VM_TRACE_FST = ");
         of.puts(v3Global.opt.trace() && v3Global.opt.traceFormat().fst() ? "1" : "0");
         of.puts("\n");
-        of.puts("# Tracing threaded output mode?  0/1/N threads (from --trace-thread)\n");
+        of.puts(
+            "# Tracing threaded output mode?  0/1/N threads (from --threads/--trace-thread)\n");
         of.puts("VM_TRACE_THREADS = ");
-        of.puts(cvtToStr(v3Global.opt.useTraceOffloadThread()));
+        of.puts(cvtToStr(v3Global.opt.vmTraceThreads()));
         of.puts("\n");
         of.puts("# Separate FST writer thread? 0/1 (from --trace-fst with --trace-thread > 0)\n");
         of.puts("VM_TRACE_FST_WRITER_THREAD = ");
diff --git a/src/V3Options.cpp b/src/V3Options.cpp
index 1b74f1062..93d23eb5e 100644
--- a/src/V3Options.cpp
+++ b/src/V3Options.cpp
@@ -775,8 +775,16 @@ void V3Options::notify() {
             && !v3Global.opt.xmlOnly());
     }
 
-    // --trace-threads implies --threads 1 unless explicitly specified
-    if (traceThreads() && !threads()) m_threads = 1;
+    if (trace()) {
+        // With --trace-fst, --trace-threads implies --threads 1 unless explicitly specified
+        if (traceFormat().fst() && traceThreads() && !threads()) m_threads = 1;
+
+        // With --trace, --trace-threads is ignored
+        if (traceFormat().vcd()) m_traceThreads = threads() ? 1 : 0;
+    }
+
+    UASSERT(!(useTraceParallel() && useTraceOffload()),
+            "Cannot use both parallel and offloaded tracing");
 
     // Default split limits if not specified
     if (m_outputSplitCFuncs < 0) m_outputSplitCFuncs = m_outputSplit;
@@ -1350,7 +1358,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
     DECL_OPTION("-trace-threads", CbVal, [this, fl](const char* valp) {
         m_trace = true;
         m_traceThreads = std::atoi(valp);
-        if (m_traceThreads < 0) fl->v3fatal("--trace-threads must be >= 0: " << valp);
+        if (m_traceThreads < 1) fl->v3fatal("--trace-threads must be >= 1: " << valp);
     });
     DECL_OPTION("-trace-underscore", OnOff, &m_traceUnderscore);
 
diff --git a/src/V3Options.h b/src/V3Options.h
index 35a71ed31..b9b5ef8ff 100644
--- a/src/V3Options.h
+++ b/src/V3Options.h
@@ -518,8 +518,10 @@ public:
     int traceMaxArray() const { return m_traceMaxArray; }
     int traceMaxWidth() const { return m_traceMaxWidth; }
     int traceThreads() const { return m_traceThreads; }
-    bool useTraceOffloadThread() const {
-        return traceThreads() == 0 ? 0 : traceThreads() - traceFormat().fst();
+    bool useTraceOffload() const { return trace() && traceFormat().fst() && traceThreads() > 1; }
+    bool useTraceParallel() const { return trace() && traceFormat().vcd() && threads() > 1; }
+    unsigned vmTraceThreads() const {
+        return useTraceParallel() ? threads() : useTraceOffload() ? 1 : 0;
     }
     int unrollCount() const { return m_unrollCount; }
     int unrollStmts() const { return m_unrollStmts; }
diff --git a/src/V3Trace.cpp b/src/V3Trace.cpp
index 61d009b6f..9fa1b099a 100644
--- a/src/V3Trace.cpp
+++ b/src/V3Trace.cpp
@@ -180,6 +180,10 @@ private:
     TraceActivityVertex* const m_alwaysVtxp;  // "Always trace" vertex
     bool m_finding = false;  // Pass one of algorithm?
 
+    // Trace parallelism. Only VCD tracing can be parallelized at this time.
+    const uint32_t m_parallelism
+        = v3Global.opt.useTraceParallel() ? static_cast<uint32_t>(v3Global.opt.threads()) : 1;
+
     VDouble0 m_statUniqSigs;  // Statistic tracking
     VDouble0 m_statUniqCodes;  // Statistic tracking
 
@@ -388,7 +392,7 @@ private:
                 if (!it->second->duplicatep()) {
                     uint32_t cost = 0;
                     const AstTraceDecl* const declp = it->second->nodep();
-                    // The number of comparisons required by tracep->chg*
+                    // The number of comparisons required by bufp->chg*
                     cost += declp->isWide() ? declp->codeInc() : 1;
                     // Arrays are traced by element
                     cost *= declp->arrayRange().ranged() ? declp->arrayRange().elements() : 1;
@@ -494,7 +498,7 @@ private:
         };
         if (isTopFunc) {
             // Top functions
-            funcp->argTypes("void* voidSelf, " + v3Global.opt.traceClassBase() + "* tracep");
+            funcp->argTypes("void* voidSelf, " + v3Global.opt.traceClassBase() + "::Buffer* bufp");
             addInitStr(voidSelfAssign(m_topModp));
             addInitStr(symClassAssign());
             // Add global activity check to change dump functions
@@ -508,32 +512,33 @@ private:
                 m_regFuncp->addStmtsp(new AstText(flp, "tracep->addChgCb(", true));
             }
             m_regFuncp->addStmtsp(new AstAddrOfCFunc(flp, funcp));
-            m_regFuncp->addStmtsp(new AstText(flp, ", vlSelf);\n", true));
+            const string threadPool{m_parallelism > 1 ? "vlSymsp->__Vm_threadPoolp" : "nullptr"};
+            m_regFuncp->addStmtsp(new AstText(flp, ", vlSelf, " + threadPool + ");\n", true));
         } else {
             // Sub functions
-            funcp->argTypes(v3Global.opt.traceClassBase() + "* tracep");
+            funcp->argTypes(v3Global.opt.traceClassBase() + "::Buffer* bufp");
             // Setup base references. Note in rare occasions we can end up with an empty trace
             // sub function, hence the VL_ATTR_UNUSED attributes.
             if (full) {
                 // Full dump sub function
                 addInitStr("uint32_t* const oldp VL_ATTR_UNUSED = "
-                           "tracep->oldp(vlSymsp->__Vm_baseCode);\n");
+                           "bufp->oldp(vlSymsp->__Vm_baseCode);\n");
             } else {
                 // Change dump sub function
-                if (v3Global.opt.useTraceOffloadThread()) {
+                if (v3Global.opt.useTraceOffload()) {
                     addInitStr("const uint32_t base VL_ATTR_UNUSED = "
                                "vlSymsp->__Vm_baseCode + "
                                + cvtToStr(baseCode) + ";\n");
-                    addInitStr("if (false && tracep) {}  // Prevent unused\n");
+                    addInitStr("if (false && bufp) {}  // Prevent unused\n");
                 } else {
                     addInitStr("uint32_t* const oldp VL_ATTR_UNUSED = "
-                               "tracep->oldp(vlSymsp->__Vm_baseCode + "
+                               "bufp->oldp(vlSymsp->__Vm_baseCode + "
                                + cvtToStr(baseCode) + ");\n");
                 }
             }
             // Add call to top function
             AstCCall* const callp = new AstCCall(funcp->fileline(), funcp);
-            callp->argTypes("tracep");
+            callp->argTypes("bufp");
             topFuncp->addStmtsp(callp);
         }
         // Done
@@ -728,7 +733,7 @@ private:
         // We will split functions such that each have to dump roughly the same amount of data
         // for this we need to keep tack of the number of codes used by the trace functions.
         uint32_t nFullCodes = 0;  // Number of non-duplicate codes (need to go into full* dump)
-        uint32_t nChgCodes = 0;  // Number of non-consant codes (need to go in to chg* dump)
+        uint32_t nChgCodes = 0;  // Number of non-constant codes (need to go in to chg* dump)
         sortTraces(traces, nFullCodes, nChgCodes);
 
         UINFO(5, "nFullCodes: " << nFullCodes << " nChgCodes: " << nChgCodes << endl);
@@ -747,13 +752,11 @@ private:
         m_regFuncp->isLoose(true);
         m_topScopep->addActivep(m_regFuncp);
 
-        const int parallelism = 1;  // Note: will bump this later, code below works for any value
-
         // Create the full dump functions, also allocates signal numbers
-        createFullTraceFunction(traces, nFullCodes, parallelism);
+        createFullTraceFunction(traces, nFullCodes, m_parallelism);
 
         // Create the incremental dump functions
-        createChgTraceFunctions(traces, nChgCodes, parallelism);
+        createChgTraceFunctions(traces, nChgCodes, m_parallelism);
 
         // Remove refs to traced values from TraceDecl nodes, these have now moved under
         // TraceInc
diff --git a/test_regress/driver.pl b/test_regress/driver.pl
index ffcfac4a8..fbae94f92 100755
--- a/test_regress/driver.pl
+++ b/test_regress/driver.pl
@@ -924,7 +924,6 @@ sub compile_vlt_flags {
     unshift @verilator_flags, "--trace" if $opt_trace;
     my $threads = ::calc_threads($Vltmt_threads);
     unshift @verilator_flags, "--threads $threads" if $param{vltmt} && $checkflags !~ /-threads /;
-    unshift @verilator_flags, "--trace-threads 1" if $param{vltmt} && $checkflags =~ /-trace /;
     unshift @verilator_flags, "--trace-threads 2" if $param{vltmt} && $checkflags =~ /-trace-fst /;
     unshift @verilator_flags, "--debug-partition" if $param{vltmt};
     unshift @verilator_flags, "-CFLAGS -ggdb -LDFLAGS -ggdb" if $opt_gdbsim;

From 26b74521780850ae05bc832200a6e340ba6e4f28 Mon Sep 17 00:00:00 2001
From: Yutetsu TAKATSUKASA <y.takatsukasa@gmail.com>
Date: Mon, 30 May 2022 19:33:06 +0900
Subject: [PATCH 08/19]  Fix #3445 of BitOpTreeOpt (#3453)

* Tests: Check BitOpTree statistics in t_const_opt.

* Tests: Add a test to reproduce #3445

* Fix #3445. Don't forget LSB of frozen node in BitOpTreeOpt.

* Apply suggestions from code review

Co-authored-by: Geza Lore <gezalore@gmail.com>
---
 src/V3Const.cpp               | 40 +++++++++++++++++++------
 test_regress/t/t_const_opt.pl |  3 ++
 test_regress/t/t_const_opt.v  | 55 +++++++++++++++++++++++++++++++++--
 3 files changed, 87 insertions(+), 11 deletions(-)

diff --git a/src/V3Const.cpp b/src/V3Const.cpp
index 759033185..3a3f89a14 100644
--- a/src/V3Const.cpp
+++ b/src/V3Const.cpp
@@ -299,7 +299,8 @@ class ConstBitOpTreeVisitor final : public VNVisitor {
     LeafInfo* m_leafp = nullptr;  // AstConst or AstVarRef that currently looking for
     const AstNode* const m_rootp;  // Root of this AST subtree
 
-    std::vector<AstNode*> m_frozenNodes;  // Nodes that cannot be optimized
+    std::vector<std::pair<AstNode*, int>>
+        m_frozenNodes;  // Nodes that cannot be optimized, int is lsb
     std::vector<BitPolarityEntry> m_bitPolarities;  // Polarity of bits found during iterate()
     std::vector<std::unique_ptr<VarInfo>> m_varInfos;  // VarInfo for each variable, [0] is nullptr
 
@@ -487,7 +488,7 @@ class ConstBitOpTreeVisitor final : public VNVisitor {
                     restorer.restoreNow();
                     // Reach past a cast then add to frozen nodes to be added to final reduction
                     if (const AstCCast* const castp = VN_CAST(opp, CCast)) opp = castp->lhsp();
-                    m_frozenNodes.push_back(opp);
+                    m_frozenNodes.emplace_back(opp, m_lsb);
                     m_failed = origFailed;
                     continue;
                 }
@@ -652,17 +653,21 @@ public:
             }
         }
 
+        std::map<int, std::vector<AstNode*>> frozenNodes;  // Group by LSB
         // Check if frozen terms are clean or not
-        for (AstNode* const termp : visitor.m_frozenNodes) {
+        for (const std::pair<AstNode*, int>& termAndLsb : visitor.m_frozenNodes) {
+            AstNode* const termp = termAndLsb.first;
             // Comparison operators are clean
-            if (VN_IS(termp, Eq) || VN_IS(termp, Neq) || VN_IS(termp, Lt) || VN_IS(termp, Lte)
-                || VN_IS(termp, Gt) || VN_IS(termp, Gte)) {
+            if ((VN_IS(termp, Eq) || VN_IS(termp, Neq) || VN_IS(termp, Lt) || VN_IS(termp, Lte)
+                 || VN_IS(termp, Gt) || VN_IS(termp, Gte))
+                && termAndLsb.second == 0) {
                 hasCleanTerm = true;
             } else {
                 // Otherwise, conservatively assume the frozen term is dirty
                 hasDirtyTerm = true;
                 UINFO(9, "Dirty frozen term: " << termp << endl);
             }
+            frozenNodes[termAndLsb.second].push_back(termp);
         }
 
         // Figure out if a final negation is required
@@ -672,7 +677,11 @@ public:
         const bool needsCleaning = visitor.isAndTree() ? !hasCleanTerm : hasDirtyTerm;
 
         // Add size of reduction tree to op count
-        resultOps += termps.size() + visitor.m_frozenNodes.size() - 1;
+        resultOps += termps.size() - 1;
+        for (const auto& lsbAndNodes : frozenNodes) {
+            if (lsbAndNodes.first > 0) ++resultOps;  // Needs AstShiftR
+            resultOps += lsbAndNodes.second.size();
+        }
         // Add final polarity flip in Xor tree
         if (needsFlip) ++resultOps;
         // Add final cleaning AND
@@ -681,7 +690,9 @@ public:
         if (debug() >= 9) {  // LCOV_EXCL_START
             cout << "Bitop tree considered: " << endl;
             for (AstNode* const termp : termps) termp->dumpTree("Reduced term: ");
-            for (AstNode* const termp : visitor.m_frozenNodes) termp->dumpTree("Frozen term: ");
+            for (const std::pair<AstNode*, int>& termp : visitor.m_frozenNodes)
+                termp.first->dumpTree("Frozen term with lsb " + std::to_string(termp.second)
+                                      + ": ");
             cout << "Needs flipping: " << needsFlip << endl;
             cout << "Needs cleaning: " << needsCleaning << endl;
             cout << "Size: " << resultOps << " input size: " << visitor.m_ops << endl;
@@ -724,8 +735,19 @@ public:
             resultp = reduce(resultp, termp);
         }
         // Add any frozen terms to the reduction
-        for (AstNode* const frozenp : visitor.m_frozenNodes) {
-            resultp = reduce(resultp, frozenp->unlinkFrBack());
+        for (auto&& lsbAndNodes : frozenNodes) {
+            AstNode* termp = nullptr;
+            for (AstNode* const itemp : lsbAndNodes.second) {
+                termp = reduce(termp, itemp->unlinkFrBack());
+            }
+            if (lsbAndNodes.first > 0) {  // LSB is not 0, so shiftR
+                AstNodeDType* const dtypep = termp->dtypep();
+                termp = new AstShiftR{termp->fileline(), termp,
+                                      new AstConst(termp->fileline(), AstConst::WidthedValue{},
+                                                   termp->width(), lsbAndNodes.first)};
+                termp->dtypep(dtypep);
+            }
+            resultp = reduce(resultp, termp);
         }
 
         // Set width of masks to expected result width. This is required to prevent later removal
diff --git a/test_regress/t/t_const_opt.pl b/test_regress/t/t_const_opt.pl
index 26143eb57..e30cb1cfb 100755
--- a/test_regress/t/t_const_opt.pl
+++ b/test_regress/t/t_const_opt.pl
@@ -18,5 +18,8 @@ execute(
     check_finished => 1,
     );
 
+if ($Self->{vlt}) {
+    file_grep($Self->{stats}, qr/Optimizations, Const bit op reduction\s+(\d+)/i, 10);
+}
 ok(1);
 1;
diff --git a/test_regress/t/t_const_opt.v b/test_regress/t/t_const_opt.v
index be1e49c03..dcce9a20b 100644
--- a/test_regress/t/t_const_opt.v
+++ b/test_regress/t/t_const_opt.v
@@ -57,7 +57,8 @@ module t(/*AUTOARG*/
          $write("[%0t] cyc==%0d crc=%x sum=%x\n", $time, cyc, crc, sum);
          if (crc !== 64'hc77bb9b3784ea091) $stop;
          // What checksum will we end up with (above print should match)
-`define EXPECTED_SUM 64'hcae926ece668f35d
+`define EXPECTED_SUM 64'h194081987b76c71c
+
          if (sum !== `EXPECTED_SUM) $stop;
          $write("*-* All Finished *-*\n");
          $finish;
@@ -79,10 +80,11 @@ module Test(/*AUTOARG*/
    logic d0, d1, d2, d3, d4, d5, d6, d7;
    logic bug3182_out;
    logic bug3197_out;
+   logic bug3445_out;
 
    output logic o;
 
-   logic [6:0] tmp;
+   logic [7:0] tmp;
    assign o = ^tmp;
 
    always_ff @(posedge clk) begin
@@ -105,10 +107,12 @@ module Test(/*AUTOARG*/
       tmp[4] <= i[0] & (i[1] & (i[2] & (i[3] | d[4])));  // ConstBitOpTreeVisitor::m_frozenNodes
       tmp[5] <= bug3182_out;
       tmp[6] <= bug3197_out;
+      tmp[7] <= bug3445_out;
    end
 
    bug3182 i_bug3182(.in(d[4:0]), .out(bug3182_out));
    bug3197 i_bug3197(.clk(clk), .in(d), .out(bug3197_out));
+   bug3445 i_bug3445(.clk(clk), .in(d), .out(bug3445_out));
 
 endmodule
 
@@ -140,3 +144,50 @@ module bug3197(input wire clk, input wire [31:0] in, output out);
    wire tmp0 = (|d[38:0]);
    assign out = (d[39] | tmp0);
 endmodule
+
+
+// Bug #3445
+// An unoptimized node is kept as frozen node, but its LSB were not saved.
+// AST of RHS of result0 looks as below:
+//   AND(SHIFTR(AND(WORDSEL(ARRAYSEL(VARREF)), WORDSEL(ARRAYSEL(VARREF)))), 32'd11)
+//                  ~~~~~~~~~~~~~~~~~~~~~~~~~  ~~~~~~~~~~~~~~~~~~~~~~~~~
+// Two of WORDSELs are frozen nodes. They are under SHIFTR of 11 bits.
+//
+// Fixing #3445 needs to
+//  1. Take AstShiftR into op count when diciding optimizable or not
+//     (result0 in the test)
+//  2. Insert AstShiftR if LSB of the frozen node is not 0 (result1 in the test)
+module bug3445(input wire clk, input wire [31:0] in, output wire out);
+   logic [127:0] d;
+   always_ff @(posedge clk)
+      d <= {d[95:0], in};
+
+   typedef struct packed {
+      logic        a;
+      logic [ 2:0] b;
+      logic [ 2:0] c;
+      logic [ 1:0] d;
+      logic [ 7:0] e;
+      logic [31:0] f;
+      logic [ 3:0] g;
+      logic [31:0] h;
+      logic        i;
+      logic [41:0] j;
+   } packed_struct;
+   packed_struct st[2];
+
+   always_ff @(posedge clk) begin
+      st[0] <= d;
+      st[1] <= st[0];
+   end
+
+   logic result0, result1;
+   always_ff @(posedge clk) begin
+      // Cannot optimize further.
+      result0 <= (st[0].g[0] & st[0].h[0]) & (in[0] == 1'b0);
+      // There are redundant !in[0] terms. They should be simplified.
+      result1 <= (!in[0] & (st[1].g[0] & st[1].h[0])) & ((in[0] == 1'b0) & !in[0]);
+   end
+
+   assign out = result0 ^ result1;
+endmodule

From 606b35853b29ed67be6d8e48f1ac65b72acff8fd Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Mon, 30 May 2022 16:44:00 +0100
Subject: [PATCH 09/19] Configure and compile with C++17 on Ubuntu 22.04

The packaged libsystemc on Ubuntu 22.04 uses C++17, so default to that
on that platform. Keep C++14 elsewhere.
---
 configure.ac | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index f0abbb265..cea30eb50 100644
--- a/configure.ac
+++ b/configure.ac
@@ -348,14 +348,18 @@ AC_SUBST(CFG_CXXFLAGS_PROFILE)
 
 # Flag to select newest language standard supported
 # Macros work such that first option that passes is the one we take
-# Currently enabled c++14 due to packaged SystemC dependency
+# Currently enable c++17/c++14 due to packaged SystemC dependency
 # c++14 is the newest that Verilator is regressed to support
 # c++11 is the oldest that Verilator supports
 # gnu is requried for Cygwin to compile verilated.h successfully
 #_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++20)
 #_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++20)
-#_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++17)
-#_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++17)
+case "$(which lsb_release 2>&1 > /dev/null && lsb_release -d)" in
+*Ubuntu*22.04*)
+_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++17)
+_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++17)
+;;
+esac
 _MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++14)
 _MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++14)
 _MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++11)

From 694919b9d19ea7a838728d3b87afe7ce0e0d32eb Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Mon, 30 May 2022 16:51:40 +0100
Subject: [PATCH 10/19] CI: add ubuntu-22.04 regressions

---
 .github/workflows/build.yml |  8 ++++++--
 ci/ci-install.bash          | 10 +++++++---
 configure.ac                |  2 +-
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 47b5f70b2..d122e1e0a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -29,7 +29,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-20.04, ubuntu-18.04]
+        os: [ubuntu-22.04, ubuntu-20.04, ubuntu-18.04]
         compiler:
           - { cc: clang, cxx: clang++ }
           - { cc: gcc,   cxx: g++     }
@@ -37,9 +37,11 @@ jobs:
         exclude:
           # Build pull requests only with ubuntu-20.04 and without m32
           - os:  ${{ github.event_name == 'pull_request' && 'ubuntu-18.04' || 'do-not-exclude' }}
+          - os:  ${{ github.event_name == 'pull_request' && 'ubuntu-22.04' || 'do-not-exclude' }}
           - m32: ${{ github.event_name == 'pull_request' && 1              || 'do-not-exclude' }}
           # Build -m32 only on ubuntu-20.04
           - {os: ubuntu-18.04, m32: 1}
+          - {os: ubuntu-22.04, m32: 1}
         include:
           # Build GCC 10 on ubuntu-20.04
           - os: ubuntu-20.04
@@ -95,7 +97,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-20.04, ubuntu-18.04]
+        os: [ubuntu-22.04, ubuntu-20.04, ubuntu-18.04]
         compiler:
           - { cc: clang, cxx: clang++ }
           - { cc: gcc,   cxx: g++     }
@@ -104,9 +106,11 @@ jobs:
         exclude:
           # Build pull requests only with ubuntu-20.04 and without m32
           - os:  ${{ github.event_name == 'pull_request' && 'ubuntu-18.04' || 'do-not-exclude' }}
+          - os:  ${{ github.event_name == 'pull_request' && 'ubuntu-22.04' || 'do-not-exclude' }}
           - m32: ${{ github.event_name == 'pull_request' && 1              || 'do-not-exclude' }}
           # Build -m32 only on ubuntu-20.04
           - {os: ubuntu-18.04, m32: 1}
+          - {os: ubuntu-22.04, m32: 1}
         include:
           # Test with GCC 10 on ubuntu-20.04 without m32
           - {os: ubuntu-20.04, compiler: { cc: gcc-10, cxx: g++-10 }, m32: 0, suite: dist-vlt-0}
diff --git a/ci/ci-install.bash b/ci/ci-install.bash
index f258916b4..4f61f06c4 100755
--- a/ci/ci-install.bash
+++ b/ci/ci-install.bash
@@ -54,8 +54,12 @@ if [ "$CI_BUILD_STAGE_NAME" = "build" ]; then
 
   if [ "$CI_OS_NAME" = "linux" ]; then
     sudo apt-get update
-    sudo apt-get install libfl-dev libgoogle-perftools-dev ccache
-    if [ "$CI_RUNS_ON" = "ubuntu-20.04" ]; then
+    sudo apt-get install libfl-dev ccache
+    if [ "$CI_RUNS_ON" != "ubuntu-22.04" ]; then
+      # Some conflict of libunwind version on 22.04, can live without it for now
+      sudo apt-get install libgoogle-perftools-dev
+    fi
+    if [ "$CI_RUNS_ON" = "ubuntu-20.04" ] || [ "$CI_RUNS_ON" = "ubuntu-22.04" ]; then
       sudo apt-get install libsystemc libsystemc-dev
     fi
     if [ "$COVERAGE" = 1 ]; then
@@ -85,7 +89,7 @@ elif [ "$CI_BUILD_STAGE_NAME" = "test" ]; then
     sudo apt-get update
     # libfl-dev needed for internal coverage's test runs
     sudo apt-get install gdb gtkwave lcov libfl-dev ccache
-    if [ "$CI_RUNS_ON" = "ubuntu-20.04" ]; then
+    if [ "$CI_RUNS_ON" = "ubuntu-20.04" ] || [ "$CI_RUNS_ON" = "ubuntu-22.04" ]; then
       sudo apt-get install libsystemc-dev
     fi
     if [ "$CI_M32" = 1 ]; then
diff --git a/configure.ac b/configure.ac
index cea30eb50..8c06cfc0a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -349,7 +349,7 @@ AC_SUBST(CFG_CXXFLAGS_PROFILE)
 # Flag to select newest language standard supported
 # Macros work such that first option that passes is the one we take
 # Currently enable c++17/c++14 due to packaged SystemC dependency
-# c++14 is the newest that Verilator is regressed to support
+# c++17 is the newest that Verilator is regularly tested to support
 # c++11 is the oldest that Verilator supports
 # gnu is requried for Cygwin to compile verilated.h successfully
 #_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++20)

From c64a07fd098c81382f5bf033dee2fe7e8ebd1c41 Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Mon, 30 May 2022 18:35:46 +0100
Subject: [PATCH 11/19] CI: fix cache keys in test jobs

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d122e1e0a..87310899f 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -126,7 +126,7 @@ jobs:
       CI_M32: ${{ matrix.m32 }}
       CC: ${{ matrix.compiler.cc }}
       CXX: ${{ matrix.compiler.cxx }}
-      CACHE_BASE_KEY: test-${{ matrix.os }}-${{ matrix.compiler.cc }}-m32=${{ matrix.m32 }}-${ matrix.suite }}
+      CACHE_BASE_KEY: test-${{ matrix.os }}-${{ matrix.compiler.cc }}-m32=${{ matrix.m32 }}-${{ matrix.suite }}
       CCACHE_MAXSIZE: 64M # Per build matrix entry (2160M in total)
       VERILATOR_ARCHIVE: verilator-${{ github.sha }}-${{ matrix.os }}-${{ matrix.compiler.cc }}${{ matrix.m32 && '-m32' || '' }}.tar.gz
     steps:

From 0c53d191139a717d142dda20921e2160e4c827a1 Mon Sep 17 00:00:00 2001
From: Huanghuang Zhou <huanghuang.zhou@terapines.com>
Date: Tue, 31 May 2022 19:10:58 +0800
Subject: [PATCH 12/19] Commentary: `InstrCountVisitor` documentation  (#3457)

Signed-off-by: huanghuang.zhou <huanghuang.zhou@terapines.com>
---
 docs/CONTRIBUTORS  | 1 +
 docs/internals.rst | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/CONTRIBUTORS b/docs/CONTRIBUTORS
index 8079639e2..d598cebd5 100644
--- a/docs/CONTRIBUTORS
+++ b/docs/CONTRIBUTORS
@@ -35,6 +35,7 @@ Guokai Chen
 Harald Heckmann
 Howard Su
 Huang Rui
+Huanghuang Zhou
 HungMingWu
 HyungKi Jeong
 Iru Cai
diff --git a/docs/internals.rst b/docs/internals.rst
index 62358f1d7..499e0fa12 100644
--- a/docs/internals.rst
+++ b/docs/internals.rst
@@ -274,7 +274,7 @@ path through the graph is the sum of macro-task execution costs. Sarkar
 does almost the same thing, except that he has nonzero estimates for
 synchronization costs.
 
-Verilator's cost estimates are assigned by ``InstrCountCostVisitor``.  This
+Verilator's cost estimates are assigned by ``InstrCountVisitor``.  This
 class is perhaps the most fragile piece of the multithread
 implementation. It's easy to have a bug where you count something cheap
 (eg. accessing one element of a huge array) as if it were expensive (eg.

From d64f979f996faee3e5a30a0cfc4e97863728cc35 Mon Sep 17 00:00:00 2001
From: Yutetsu TAKATSUKASA <y.takatsukasa@gmail.com>
Date: Wed, 1 Jun 2022 09:26:16 +0900
Subject: [PATCH 13/19] Fix BitOpTree optimization to consider polarity of
 frozen node (#3445) (#3459)

* Tests: add a test to another failing case of #3445

* Consider polarity as lsb in BitOpTree optimization.
---
 src/V3Const.cpp               | 47 ++++++++++++++++++++++++-----------
 test_regress/t/t_const_opt.pl |  2 +-
 test_regress/t/t_const_opt.v  | 36 ++++++++++++++++++---------
 3 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/src/V3Const.cpp b/src/V3Const.cpp
index 3a3f89a14..9f39e2884 100644
--- a/src/V3Const.cpp
+++ b/src/V3Const.cpp
@@ -111,6 +111,15 @@ class ConstBitOpTreeVisitor final : public VNVisitor {
         BitPolarityEntry() = default;
     };
 
+    struct FrozenNodeInfo final {  // Context when a frozen node is found
+        bool m_polarity;
+        int m_lsb;
+        bool operator<(const FrozenNodeInfo& other) const {
+            if (m_lsb != other.m_lsb) return m_lsb < other.m_lsb;
+            return m_polarity < other.m_polarity;
+        }
+    };
+
     class Restorer final {  // Restore the original state unless disableRestore() is called
         ConstBitOpTreeVisitor& m_visitor;
         const size_t m_polaritiesSize;
@@ -299,8 +308,8 @@ class ConstBitOpTreeVisitor final : public VNVisitor {
     LeafInfo* m_leafp = nullptr;  // AstConst or AstVarRef that currently looking for
     const AstNode* const m_rootp;  // Root of this AST subtree
 
-    std::vector<std::pair<AstNode*, int>>
-        m_frozenNodes;  // Nodes that cannot be optimized, int is lsb
+    std::vector<std::pair<AstNode*, FrozenNodeInfo>>
+        m_frozenNodes;  // Nodes that cannot be optimized
     std::vector<BitPolarityEntry> m_bitPolarities;  // Polarity of bits found during iterate()
     std::vector<std::unique_ptr<VarInfo>> m_varInfos;  // VarInfo for each variable, [0] is nullptr
 
@@ -488,7 +497,7 @@ class ConstBitOpTreeVisitor final : public VNVisitor {
                     restorer.restoreNow();
                     // Reach past a cast then add to frozen nodes to be added to final reduction
                     if (const AstCCast* const castp = VN_CAST(opp, CCast)) opp = castp->lhsp();
-                    m_frozenNodes.emplace_back(opp, m_lsb);
+                    m_frozenNodes.emplace_back(opp, FrozenNodeInfo{m_polarity, m_lsb});
                     m_failed = origFailed;
                     continue;
                 }
@@ -653,21 +662,21 @@ public:
             }
         }
 
-        std::map<int, std::vector<AstNode*>> frozenNodes;  // Group by LSB
+        std::map<FrozenNodeInfo, std::vector<AstNode*>> frozenNodes;  // Group by FrozenNodeInfo
         // Check if frozen terms are clean or not
-        for (const std::pair<AstNode*, int>& termAndLsb : visitor.m_frozenNodes) {
-            AstNode* const termp = termAndLsb.first;
+        for (const auto& frozenInfo : visitor.m_frozenNodes) {
+            AstNode* const termp = frozenInfo.first;
             // Comparison operators are clean
             if ((VN_IS(termp, Eq) || VN_IS(termp, Neq) || VN_IS(termp, Lt) || VN_IS(termp, Lte)
                  || VN_IS(termp, Gt) || VN_IS(termp, Gte))
-                && termAndLsb.second == 0) {
+                && frozenInfo.second.m_lsb == 0) {
                 hasCleanTerm = true;
             } else {
                 // Otherwise, conservatively assume the frozen term is dirty
                 hasDirtyTerm = true;
                 UINFO(9, "Dirty frozen term: " << termp << endl);
             }
-            frozenNodes[termAndLsb.second].push_back(termp);
+            frozenNodes[frozenInfo.second].push_back(termp);
         }
 
         // Figure out if a final negation is required
@@ -679,7 +688,8 @@ public:
         // Add size of reduction tree to op count
         resultOps += termps.size() - 1;
         for (const auto& lsbAndNodes : frozenNodes) {
-            if (lsbAndNodes.first > 0) ++resultOps;  // Needs AstShiftR
+            if (lsbAndNodes.first.m_lsb > 0) ++resultOps;  // Needs AstShiftR
+            if (!lsbAndNodes.first.m_polarity) ++resultOps;  // Needs AstNot
             resultOps += lsbAndNodes.second.size();
         }
         // Add final polarity flip in Xor tree
@@ -690,8 +700,9 @@ public:
         if (debug() >= 9) {  // LCOV_EXCL_START
             cout << "Bitop tree considered: " << endl;
             for (AstNode* const termp : termps) termp->dumpTree("Reduced term: ");
-            for (const std::pair<AstNode*, int>& termp : visitor.m_frozenNodes)
-                termp.first->dumpTree("Frozen term with lsb " + std::to_string(termp.second)
+            for (const std::pair<AstNode*, FrozenNodeInfo>& termp : visitor.m_frozenNodes)
+                termp.first->dumpTree("Frozen term with lsb " + std::to_string(termp.second.m_lsb)
+                                      + " polarity " + std::to_string(termp.second.m_polarity)
                                       + ": ");
             cout << "Needs flipping: " << needsFlip << endl;
             cout << "Needs cleaning: " << needsCleaning << endl;
@@ -735,16 +746,22 @@ public:
             resultp = reduce(resultp, termp);
         }
         // Add any frozen terms to the reduction
-        for (auto&& lsbAndNodes : frozenNodes) {
+        for (auto&& nodes : frozenNodes) {
+            // nodes.second has same lsb and polarity
             AstNode* termp = nullptr;
-            for (AstNode* const itemp : lsbAndNodes.second) {
+            for (AstNode* const itemp : nodes.second) {
                 termp = reduce(termp, itemp->unlinkFrBack());
             }
-            if (lsbAndNodes.first > 0) {  // LSB is not 0, so shiftR
+            if (nodes.first.m_lsb > 0) {  // LSB is not 0, so shiftR
                 AstNodeDType* const dtypep = termp->dtypep();
                 termp = new AstShiftR{termp->fileline(), termp,
                                       new AstConst(termp->fileline(), AstConst::WidthedValue{},
-                                                   termp->width(), lsbAndNodes.first)};
+                                                   termp->width(), nodes.first.m_lsb)};
+                termp->dtypep(dtypep);
+            }
+            if (!nodes.first.m_polarity) {  // Polarity is inverted, so append Not
+                AstNodeDType* const dtypep = termp->dtypep();
+                termp = new AstNot{termp->fileline(), termp};
                 termp->dtypep(dtypep);
             }
             resultp = reduce(resultp, termp);
diff --git a/test_regress/t/t_const_opt.pl b/test_regress/t/t_const_opt.pl
index e30cb1cfb..83e301744 100755
--- a/test_regress/t/t_const_opt.pl
+++ b/test_regress/t/t_const_opt.pl
@@ -19,7 +19,7 @@ execute(
     );
 
 if ($Self->{vlt}) {
-    file_grep($Self->{stats}, qr/Optimizations, Const bit op reduction\s+(\d+)/i, 10);
+    file_grep($Self->{stats}, qr/Optimizations, Const bit op reduction\s+(\d+)/i, 11);
 }
 ok(1);
 1;
diff --git a/test_regress/t/t_const_opt.v b/test_regress/t/t_const_opt.v
index dcce9a20b..407fef13c 100644
--- a/test_regress/t/t_const_opt.v
+++ b/test_regress/t/t_const_opt.v
@@ -4,6 +4,11 @@
 // any use, without warranty, 2021 Yutetsu TAKATSUKASA.
 // SPDX-License-Identifier: CC0-1.0
 
+// This function always returns 0, so safe to take bitwise OR with any value.
+// Calling this function stops constant folding as Verilator does not know
+// what this function returns.
+import "DPI-C" context function int fake_dependency();
+
 module t(/*AUTOARG*/
    // Inputs
    clk
@@ -57,7 +62,7 @@ module t(/*AUTOARG*/
          $write("[%0t] cyc==%0d crc=%x sum=%x\n", $time, cyc, crc, sum);
          if (crc !== 64'hc77bb9b3784ea091) $stop;
          // What checksum will we end up with (above print should match)
-`define EXPECTED_SUM 64'h194081987b76c71c
+`define EXPECTED_SUM 64'hdccb9e7b8b638233
 
          if (sum !== `EXPECTED_SUM) $stop;
          $write("*-* All Finished *-*\n");
@@ -120,11 +125,6 @@ module bug3182(in, out);
    input wire [4:0] in;
    output wire out;
 
-   // This function always returns 0, so safe to take bitwise OR with any value.
-   // Calling this function stops constant folding as Verialtor does not know
-   // what this function returns.
-   import "DPI-C" context function int fake_dependency();
-
    logic [4:0] bit_source;
 
    /* verilator lint_off WIDTH */
@@ -147,16 +147,18 @@ endmodule
 
 
 // Bug #3445
-// An unoptimized node is kept as frozen node, but its LSB were not saved.
+// An unoptimized node is kept as frozen node, but its LSB and polarity were not saved.
 // AST of RHS of result0 looks as below:
 //   AND(SHIFTR(AND(WORDSEL(ARRAYSEL(VARREF)), WORDSEL(ARRAYSEL(VARREF)))), 32'd11)
 //                  ~~~~~~~~~~~~~~~~~~~~~~~~~  ~~~~~~~~~~~~~~~~~~~~~~~~~
 // Two of WORDSELs are frozen nodes. They are under SHIFTR of 11 bits.
 //
 // Fixing #3445 needs to
-//  1. Take AstShiftR into op count when diciding optimizable or not
-//     (result0 in the test)
+//  1. Take AstShiftR and AstNot into op count when deciding optimizable or not
+//     (result0 and result2 in the test)
 //  2. Insert AstShiftR if LSB of the frozen node is not 0 (result1 in the test)
+//  3. Insert AstNot if polarity of the frozen node is false (result3 in the
+//  test)
 module bug3445(input wire clk, input wire [31:0] in, output wire out);
    logic [127:0] d;
    always_ff @(posedge clk)
@@ -174,20 +176,30 @@ module bug3445(input wire clk, input wire [31:0] in, output wire out);
       logic        i;
       logic [41:0] j;
    } packed_struct;
-   packed_struct st[2];
+   packed_struct st[4];
 
+   // This is always 1'b0, but Verilator cannot notice it.
+   // This signal helps to reveal wrong optimization of result2 and result3.
+   logic zero;
    always_ff @(posedge clk) begin
       st[0] <= d;
       st[1] <= st[0];
+      st[2] <= st[1];
+      st[3] <= st[2];
+      zero <= fake_dependency() > 0;
    end
 
-   logic result0, result1;
+   logic result0, result1, result2, result3;
    always_ff @(posedge clk) begin
       // Cannot optimize further.
       result0 <= (st[0].g[0] & st[0].h[0]) & (in[0] == 1'b0);
       // There are redundant !in[0] terms. They should be simplified.
       result1 <= (!in[0] & (st[1].g[0] & st[1].h[0])) & ((in[0] == 1'b0) & !in[0]);
+      // Cannot optimize further.
+      result2 <= !(st[2].g[0] & st[2].h[0]) & (zero == 1'b0);
+      // There are redundant zero terms. They should be simplified.
+      result3 <= (!zero & !(st[3].g[0] & st[3].h[0])) & ((zero == 1'b0) & !zero);
    end
 
-   assign out = result0 ^ result1;
+   assign out = result0 ^ result1 ^ (result2 | result3);
 endmodule

From 6039e9dcc3d2c281279e2a2760887d71adcadda7 Mon Sep 17 00:00:00 2001
From: Wilson Snyder <wsnyder@wsnyder.org>
Date: Thu, 2 Jun 2022 21:32:22 -0400
Subject: [PATCH 14/19] Commentary

---
 docs/guide/faq.rst | 43 +++++++++++++++++++++++++++++--------------
 docs/spelling.txt  |  3 +++
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/docs/guide/faq.rst b/docs/guide/faq.rst
index 5cc4acd43..0b70ea289 100644
--- a/docs/guide/faq.rst
+++ b/docs/guide/faq.rst
@@ -72,23 +72,38 @@ a good thing for getting working silicon.
 Will Verilator output remain under my own license/copyright?
 """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
 
-Yes, it's just like using GCC on your programs; this is why Verilator uses
-the "GNU **Lesser** Public License Version 3" instead of the more typical
-"GNU Public License".  See the licenses for details, but in brief, if you
-change Verilator itself or the header files Verilator includes, you must
-make the source code available under the GNU Lesser Public License.
-However, Verilator output (the Verilated code) only "include"s the licensed
-files, and so you are **not** required to open-source release any output
-from Verilator.
+Your SystemVerilog, VPI/DPI, or main() C++ code remains under your own license.
+
+It's just like how using GCC on your programs does not change the copyright
+of your program; this is why Verilator uses the "GNU **Lesser** Public
+License Version 3" instead of the more typical "GNU Public License".  See
+the licenses for details.
+
+Some examples:
+
+* Any SystemVerilog or other input fed into Verilator remains your own.
+
+* Any of your VPI/DPI C++ routines that Verilator calls remain your own.
+
+* Any of your main() C++ code that calls into Verilator remains your own.
+
+* If you change Verilator itself, for example changing or adding a file
+  under the src/ directory in the repository, you must make the source code
+  available under the GNU Lesser Public License.
+
+* If you change a header Verilator provides, for example under include/ in
+  the repository, you must make the source code available under the GNU
+  Lesser Public License.
 
 You also have the option of using the Perl Artistic License, which again
-does not require you to release your Verilog or generated code, and also
-allows you to modify Verilator for internal use without distributing the
-modified version.  But please contribute back to the community!
+does not require you to release your Verilog, C++, or generated code. This
+license also allows you to modify Verilator for internal use without
+distributing the modified version.  But please contribute back to the
+community!
 
-One limit is that you cannot under either license release a closed-source
-Verilog simulation product incorporating Verilator. That is you can have a
-commercial product, but must make the source code available.
+Under both licenses you can offer a commercial product that is based on
+Verilator either directly or embedded within.  However under both licenses,
+any changes you make to Verilator for such a product must be open sourced.
 
 As is standard with Open Source, contributions back to Verilator will be
 placed under the Verilator copyright and LGPL/Artistic license.  Small test
diff --git a/docs/spelling.txt b/docs/spelling.txt
index 9014a6af6..0e423ba26 100644
--- a/docs/spelling.txt
+++ b/docs/spelling.txt
@@ -683,6 +683,7 @@ onehot
 ooo
 oprofile
 oversubscription
+parallelized
 param
 parameterized
 params
@@ -771,6 +772,7 @@ specparam
 splitme
 spp
 sqrt
+src
 srcdir
 srcfile
 sscanf
@@ -889,6 +891,7 @@ writeme
 writemem
 writememb
 writememh
+xiak
 xin
 xml
 xnor

From 1f3e8640f7eb795438f3e4baa648438ac6d859b2 Mon Sep 17 00:00:00 2001
From: Wilson Snyder <wsnyder@wsnyder.org>
Date: Fri, 3 Jun 2022 18:45:39 -0400
Subject: [PATCH 15/19] Examples: -Os should be a compile flag, not Verilator
 flag.

---
 examples/cmake_tracing_c/CMakeLists.txt  | 2 +-
 examples/cmake_tracing_sc/CMakeLists.txt | 2 +-
 examples/make_protect_lib/Makefile       | 2 +-
 examples/make_tracing_c/Makefile         | 2 +-
 examples/make_tracing_sc/Makefile        | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/cmake_tracing_c/CMakeLists.txt b/examples/cmake_tracing_c/CMakeLists.txt
index 522c20cc5..95fb3dfb2 100644
--- a/examples/cmake_tracing_c/CMakeLists.txt
+++ b/examples/cmake_tracing_c/CMakeLists.txt
@@ -33,5 +33,5 @@ add_executable(example ../make_tracing_c/sim_main.cpp)
 # Add the Verilated circuit to the target
 verilate(example COVERAGE TRACE
   INCLUDE_DIRS "../make_tracing_c"
-  VERILATOR_ARGS -f ../make_tracing_c/input.vc -Os -x-assign 0
+  VERILATOR_ARGS -f ../make_tracing_c/input.vc -x-assign fast
   SOURCES ../make_tracing_c/top.v)
diff --git a/examples/cmake_tracing_sc/CMakeLists.txt b/examples/cmake_tracing_sc/CMakeLists.txt
index 4651d1709..0d67a8cf5 100644
--- a/examples/cmake_tracing_sc/CMakeLists.txt
+++ b/examples/cmake_tracing_sc/CMakeLists.txt
@@ -45,7 +45,7 @@ set_property(
 # Add the Verilated circuit to the target
 verilate(example SYSTEMC COVERAGE TRACE
   INCLUDE_DIRS "../make_tracing_sc"
-  VERILATOR_ARGS -f ../make_tracing_sc/input.vc -Os -x-assign 0
+  VERILATOR_ARGS -f ../make_tracing_sc/input.vc -x-assign fast
   SOURCES ../make_tracing_sc/top.v)
 
 verilator_link_systemc(example)
diff --git a/examples/make_protect_lib/Makefile b/examples/make_protect_lib/Makefile
index 215df0396..359ece33e 100644
--- a/examples/make_protect_lib/Makefile
+++ b/examples/make_protect_lib/Makefile
@@ -33,7 +33,7 @@ VERILATOR_FLAGS =
 # Generate C++
 VERILATOR_FLAGS += -cc
 # Optimize
-VERILATOR_FLAGS += -Os -x-assign 0
+VERILATOR_FLAGS += -x-assign fast
 # Warn abount lint issues; may not want this on less solid designs
 VERILATOR_FLAGS += -Wall
 # This example does not use vl_time_stamp but rather
diff --git a/examples/make_tracing_c/Makefile b/examples/make_tracing_c/Makefile
index be77c71e4..e7dcaf244 100644
--- a/examples/make_tracing_c/Makefile
+++ b/examples/make_tracing_c/Makefile
@@ -36,7 +36,7 @@ VERILATOR_FLAGS += -cc --exe
 # Generate makefile dependencies (not shown as complicates the Makefile)
 #VERILATOR_FLAGS += -MMD
 # Optimize
-VERILATOR_FLAGS += -Os -x-assign 0
+VERILATOR_FLAGS += -x-assign fast
 # Warn abount lint issues; may not want this on less solid designs
 VERILATOR_FLAGS += -Wall
 # Make waveforms
diff --git a/examples/make_tracing_sc/Makefile b/examples/make_tracing_sc/Makefile
index 80a6221b2..5f90a5ebf 100644
--- a/examples/make_tracing_sc/Makefile
+++ b/examples/make_tracing_sc/Makefile
@@ -37,7 +37,7 @@ VERILATOR_FLAGS += -sc --exe
 # Generate makefile dependencies (not shown as complicates the Makefile)
 #VERILATOR_FLAGS += -MMD
 # Optimize
-VERILATOR_FLAGS += -Os -x-assign 0
+VERILATOR_FLAGS += -x-assign fast
 # Warn abount lint issues; may not want this on less solid designs
 VERILATOR_FLAGS += -Wall
 # Make waveforms

From 173f57c63639d5b4be2a229c468c84ffbfc0ecf8 Mon Sep 17 00:00:00 2001
From: Wilson Snyder <wsnyder@wsnyder.org>
Date: Fri, 3 Jun 2022 19:41:59 -0400
Subject: [PATCH 16/19] Changed --no-merge-const-pool to -fno-merge-const-pool
 (#3436).

---
 Changes                                       |  1 +
 bin/verilator                                 |  2 +-
 docs/guide/exe_verilator.rst                  | 15 ++++----
 src/V3OptionParser.cpp                        | 34 ++++++++++++++++---
 src/V3OptionParser.h                          | 16 ++++++---
 src/V3Options.cpp                             |  2 +-
 src/V3Options.h                               |  4 +--
 src/V3Premit.cpp                              |  2 +-
 .../t/t_extract_static_const_no_merge.pl      |  2 +-
 9 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/Changes b/Changes
index 8c9418c9f..ec20c6fda 100644
--- a/Changes
+++ b/Changes
@@ -17,6 +17,7 @@ Verilator 4.223 devel
 
 **Minor:**
 
+* Changed --no-merge-const-pool to -fno-merge-const-pool (#3436).
 * Support compile time trace signal selection with tracing_on/off (#3323). [Shunyao CAD]
 * Add assert when VerilatedContext is mis-deleted (#3121). [Rupert Swarbrick]
 * Define VM_TRACE_VCD when tracing in VCD format. [Geza Lore, Shunyao CAD]
diff --git a/bin/verilator b/bin/verilator
index 40be6ba0f..367651d32 100755
--- a/bin/verilator
+++ b/bin/verilator
@@ -319,6 +319,7 @@ detailed descriptions of these arguments.
      -f <file>                  Parse arguments from a file
      -FI <file>                 Force include of a file
     --flatten                   Force inlining of all modules, tasks and functions
+    --fno-merge-const-pool      Disable merging of different types in const pool
      -G<name>=<value>           Overwrite top-level parameter
     --gdb                       Run Verilator under GDB interactively
     --gdbbt                     Run Verilator under GDB for backtrace
@@ -344,7 +345,6 @@ detailed descriptions of these arguments.
     --MMD                       Create .d dependency files
     --MP                        Create phony dependency targets
     --Mdir <directory>          Name of output object directory
-    --no-merge-const-pool       Disable merging of different types in const pool
     --mod-prefix <topname>      Name to prepend to lower classes
     --no-clk <signal-name>      Prevent marking specified signal as clock
     --no-decoration             Disable comments and symbol decorations
diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst
index 6100dcd55..70b3752ad 100644
--- a/docs/guide/exe_verilator.rst
+++ b/docs/guide/exe_verilator.rst
@@ -431,6 +431,14 @@ Summary:
    flattening large designs may require significant CPU time, memory and
    storage.
 
+.. option:: --fno-merge-const-pool
+
+   Rarely needed; only use if recommended by maintainers.  In order to
+   minimize cache footprint, values of different data type, that are yet
+   emitted identically in C++ are merged in the constant pool.  This option
+   disables this and causes every constant pool entry with a distinct data
+   type to be emitted separately.
+
 .. option:: -G<name>=<value>
 
    Overwrites the given parameter of the toplevel module. The value is
@@ -648,13 +656,6 @@ Summary:
    The directory is created if it does not exist and the parent directories
    exist; otherwise manually create the Mdir before calling Verilator.
 
-.. option:: --no-merge-const-pool
-
-   Rarely needed.  In order to minimize cache footprint, values of different
-   data type, that are yet emitted identically in C++ are merged in the
-   constant pool.  This option disables this and causes every constant pool
-   entry with a distinct data type to be emitted separately.
-
 .. option:: --mod-prefix <topname>
 
    Specifies the name to prepend to all lower level classes.  Defaults to
diff --git a/src/V3OptionParser.cpp b/src/V3OptionParser.cpp
index 4439ba53d..d98b4fd90 100644
--- a/src/V3OptionParser.cpp
+++ b/src/V3OptionParser.cpp
@@ -30,6 +30,7 @@ struct V3OptionParser::Impl {
     // Setting for isOnOffAllowed() and isPartialMatchAllowed()
     enum class en : uint8_t {
         NONE,  // "-opt"
+        FONOFF,  // "-fopt" and "-fno-opt"
         ONOFF,  // "-opt" and "-no-opt"
         VALUE  // "-opt val"
     };
@@ -39,6 +40,7 @@ struct V3OptionParser::Impl {
         bool m_undocumented = false;  // This option is not documented
     public:
         virtual bool isValueNeeded() const override final { return MODE == en::VALUE; }
+        virtual bool isFOnOffAllowed() const override final { return MODE == en::FONOFF; }
         virtual bool isOnOffAllowed() const override final { return MODE == en::ONOFF; }
         virtual bool isPartialMatchAllowed() const override final { return ALLOW_PARTIAL_MATCH; }
         virtual bool isUndocumented() const override { return m_undocumented; }
@@ -47,6 +49,7 @@ struct V3OptionParser::Impl {
 
     // Actual action classes
     template <typename T> class ActionSet;  // "-opt" for bool-ish, "-opt val" for int and string
+    template <typename BOOL> class ActionFOnOff;  // "-fopt" and "-fno-opt" for bool-ish
     template <typename BOOL> class ActionOnOff;  // "-opt" and "-no-opt" for bool-ish
     class ActionCbCall;  // Callback without argument for "-opt"
     class ActionCbOnOff;  // Callback for "-opt" and "-no-opt"
@@ -80,6 +83,7 @@ V3OPTION_PARSER_DEF_ACT_CLASS(ActionSet, VOptionBool, m_valp->setTrueOrFalse(tru
 V3OPTION_PARSER_DEF_ACT_CLASS(ActionSet, int, *m_valp = std::atoi(argp), en::VALUE);
 V3OPTION_PARSER_DEF_ACT_CLASS(ActionSet, string, *m_valp = argp, en::VALUE);
 
+V3OPTION_PARSER_DEF_ACT_CLASS(ActionFOnOff, bool, *m_valp = !hasPrefixFNo(optp), en::FONOFF);
 V3OPTION_PARSER_DEF_ACT_CLASS(ActionOnOff, bool, *m_valp = !hasPrefixNo(optp), en::ONOFF);
 #ifndef V3OPTION_PARSER_NO_VOPTION_BOOL
 V3OPTION_PARSER_DEF_ACT_CLASS(ActionOnOff, VOptionBool, m_valp->setTrueOrFalse(!hasPrefixNo(optp)),
@@ -117,12 +121,23 @@ V3OPTION_PARSER_DEF_ACT_CB_CLASS(ActionCbPartialMatchVal, void(const char*, cons
 
 V3OptionParser::ActionIfs* V3OptionParser::find(const char* optp) {
     const auto it = m_pimpl->m_options.find(optp);
-    if (it != m_pimpl->m_options.end()) return it->second.get();
+    if (it != m_pimpl->m_options.end()) return it->second.get();  // Exact match
     for (auto&& act : m_pimpl->m_options) {
+        if (act.second->isFOnOffAllowed()) {  // Find starts with "-fno"
+            if (const char* const nop
+                = VString::startsWith(optp, "-fno-") ? (optp + strlen("-fno-")) : nullptr) {
+                if (act.first.substr(strlen("-f"), std::string::npos)
+                    == nop) {  // [-f]opt = [-fno-]opt
+                    return act.second.get();
+                }
+            }
+        }
         if (act.second->isOnOffAllowed()) {  // Find starts with "-no"
-            const char* const nop = VString::startsWith(optp, "-no") ? (optp + 3) : nullptr;
-            if (nop && (act.first == nop || act.first == (string{"-"} + nop))) {
-                return act.second.get();
+            if (const char* const nop
+                = VString::startsWith(optp, "-no") ? (optp + strlen("-no")) : nullptr) {
+                if (act.first == nop || act.first == (string{"-"} + nop)) {
+                    return act.second.get();
+                }
             }
         } else if (act.second->isPartialMatchAllowed()) {
             if (VString::startsWith(optp, act.first)) return act.second.get();
@@ -143,6 +158,12 @@ V3OptionParser::ActionIfs& V3OptionParser::add(const std::string& opt, ARG arg)
     return *insertedResult.first->second;
 }
 
+bool V3OptionParser::hasPrefixFNo(const char* strp) {  // True for "-fno..." and "--fno..."
+    UASSERT(strp[0] == '-', strp << " does not start with '-'");
+    if (strp[1] == '-') ++strp;  // Skip one '-' so "--fno" is tested as "-fno"
+    return VString::startsWith(strp, "-fno");
+}
+
 bool V3OptionParser::hasPrefixNo(const char* strp) {
     UASSERT(strp[0] == '-', strp << " does not start with '-'");
     if (strp[1] == '-') ++strp;
@@ -178,6 +199,10 @@ void V3OptionParser::finalize() {
     for (auto&& opt : m_pimpl->m_options) {
         if (opt.second->isUndocumented()) continue;
         m_pimpl->m_spellCheck.pushCandidate(opt.first);
+        if (opt.second->isFOnOffAllowed()) {
+            m_pimpl->m_spellCheck.pushCandidate(
+                "-fno-" + opt.first.substr(strlen("-f"), std::string::npos));
+        }
         if (opt.second->isOnOffAllowed()) m_pimpl->m_spellCheck.pushCandidate("-no" + opt.first);
     }
     m_pimpl->m_isFinalized = true;
@@ -202,6 +227,7 @@ V3OPTION_PARSER_DEF_OP(Set, VOptionBool*, ActionSet<VOptionBool>)
 #endif
 V3OPTION_PARSER_DEF_OP(Set, int*, ActionSet<int>)
 V3OPTION_PARSER_DEF_OP(Set, string*, ActionSet<string>)
+V3OPTION_PARSER_DEF_OP(FOnOff, bool*, ActionFOnOff<bool>)
 V3OPTION_PARSER_DEF_OP(OnOff, bool*, ActionOnOff<bool>)
 #ifndef V3OPTION_PARSER_NO_VOPTION_BOOL
 V3OPTION_PARSER_DEF_OP(OnOff, VOptionBool*, ActionOnOff<VOptionBool>)
diff --git a/src/V3OptionParser.h b/src/V3OptionParser.h
index fc199264f..e77f43a26 100644
--- a/src/V3OptionParser.h
+++ b/src/V3OptionParser.h
@@ -66,6 +66,7 @@ private:
     // METHODS
     ActionIfs* find(const char* optp);
     template <class ACT, class ARG> ActionIfs& add(const string& opt, ARG arg);
+    static bool hasPrefixFNo(const char* strp);  // Returns true if strp starts with "-fno"
     static bool hasPrefixNo(const char* strp);  // Returns true if strp starts with "-no"
 
 public:
@@ -87,6 +88,7 @@ class V3OptionParser::ActionIfs VL_NOT_FINAL {
 public:
     virtual ~ActionIfs() = default;
     virtual bool isValueNeeded() const = 0;  // Need val of "-opt val"
+    virtual bool isFOnOffAllowed() const = 0;  // true if "-fno-opt" is allowed
     virtual bool isOnOffAllowed() const = 0;  // true if "-no-opt" is allowd
     virtual bool isPartialMatchAllowed() const = 0;  // true if "-Wno-" matches "-Wno-fatal"
     virtual bool isUndocumented() const = 0;  // Will not be suggested in typo
@@ -101,13 +103,15 @@ class V3OptionParser::AppendHelper final {
 public:
     // TYPES
     // Tag to specify which operator() to call
-    struct Set {};  // For ActionSet
+    struct FOnOff {};  // For ActionFOnOff
     struct OnOff {};  // For ActionOnOff
+    struct Set {};  // For ActionSet
+
     struct CbCall {};  // For ActionCbCall
-    struct CbOnOff {};  // For ActionOnOff
-    struct CbVal {};  // For ActionCbVal
+    struct CbOnOff {};  // For ActionOnOff or ActionFOnOff
     struct CbPartialMatch {};  // For ActionCbPartialMatch
     struct CbPartialMatchVal {};  // For ActionCbPartialMatchVal
+    struct CbVal {};  // For ActionCbVal
 
 private:
     // MEMBERS
@@ -122,6 +126,7 @@ public:
     ActionIfs& operator()(const char* optp, Set, int*) const;
     ActionIfs& operator()(const char* optp, Set, string*) const;
 
+    ActionIfs& operator()(const char* optp, FOnOff, bool*) const;
     ActionIfs& operator()(const char* optp, OnOff, bool*) const;
 #ifndef V3OPTION_PARSER_NO_VOPTION_BOOL
     ActionIfs& operator()(const char* optp, OnOff, VOptionBool*) const;
@@ -144,13 +149,14 @@ public:
 
 #define V3OPTION_PARSER_DECL_TAGS \
     const auto Set VL_ATTR_UNUSED = V3OptionParser::AppendHelper::Set{}; \
+    const auto FOnOff VL_ATTR_UNUSED = V3OptionParser::AppendHelper::FOnOff{}; \
     const auto OnOff VL_ATTR_UNUSED = V3OptionParser::AppendHelper::OnOff{}; \
     const auto CbCall VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbCall{}; \
     const auto CbOnOff VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbOnOff{}; \
-    const auto CbVal VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbVal{}; \
     const auto CbPartialMatch VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbPartialMatch{}; \
     const auto CbPartialMatchVal VL_ATTR_UNUSED \
-        = V3OptionParser::AppendHelper::CbPartialMatchVal {}
+        = V3OptionParser::AppendHelper::CbPartialMatchVal{}; \
+    const auto CbVal VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbVal{};
 
 //######################################################################
 
diff --git a/src/V3Options.cpp b/src/V3Options.cpp
index 93d23eb5e..2a4c3050d 100644
--- a/src/V3Options.cpp
+++ b/src/V3Options.cpp
@@ -1082,6 +1082,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
         parseOptsFile(fl, parseFileArg(optdir, valp), false);
     });
     DECL_OPTION("-flatten", OnOff, &m_flatten);
+    DECL_OPTION("-fmerge-const-pool", FOnOff, &m_fMergeConstPool);
 
     DECL_OPTION("-G", CbPartialMatch, [this](const char* optp) { addParameter(optp, false); });
     DECL_OPTION("-gate-stmts", Set, &m_gateStmts);
@@ -1152,7 +1153,6 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
         }
     });
     DECL_OPTION("-max-num-width", Set, &m_maxNumWidth);
-    DECL_OPTION("-merge-const-pool", OnOff, &m_mergeConstPool);
     DECL_OPTION("-mod-prefix", Set, &m_modPrefix);
 
     DECL_OPTION("-O", CbPartialMatch, [this](const char* optp) {
diff --git a/src/V3Options.h b/src/V3Options.h
index b9b5ef8ff..e1756ab3d 100644
--- a/src/V3Options.h
+++ b/src/V3Options.h
@@ -239,6 +239,7 @@ private:
     bool m_dumpDefines = false;     // main switch: --dump-defines
     bool m_dumpTreeAddrids = false; // main switch: --dump-tree-addrids
     bool m_exe = false;             // main switch: --exe
+    bool m_fMergeConstPool = true;  // main switch: --fmerge-const-pool
     bool m_flatten = false;         // main switch: --flatten
     bool m_hierarchical = false;    // main switch: --hierarchical
     bool m_hierChild = false;       // main switch: --hierarchical-child
@@ -246,7 +247,6 @@ private:
     bool m_lintOnly = false;        // main switch: --lint-only
     bool m_gmake = false;           // main switch: --make gmake
     bool m_main = false;            // main swithc: --main
-    bool m_mergeConstPool = true;   // main switch: --merge-const-pool
     bool m_orderClockDly = true;    // main switch: --order-clock-delay
     bool m_outFormatOk = false;     // main switch: --cc, --sc or --sp was specified
     bool m_pedantic = false;        // main switch: --Wpedantic
@@ -448,6 +448,7 @@ public:
     bool dpiHdrOnly() const { return m_dpiHdrOnly; }
     bool dumpDefines() const { return m_dumpDefines; }
     bool exe() const { return m_exe; }
+    bool fMergeConstPool() const { return m_fMergeConstPool; }
     bool flatten() const { return m_flatten; }
     bool gmake() const { return m_gmake; }
     bool threadsDpiPure() const { return m_threadsDpiPure; }
@@ -459,7 +460,6 @@ public:
     bool traceStructs() const { return m_traceStructs; }
     bool traceUnderscore() const { return m_traceUnderscore; }
     bool main() const { return m_main; }
-    bool mergeConstPool() const { return m_mergeConstPool; }
     bool orderClockDly() const { return m_orderClockDly; }
     bool outFormatOk() const { return m_outFormatOk; }
     bool keepTempFiles() const { return (V3Error::debugDefault() != 0); }
diff --git a/src/V3Premit.cpp b/src/V3Premit.cpp
index 7501cd456..836b7c814 100644
--- a/src/V3Premit.cpp
+++ b/src/V3Premit.cpp
@@ -133,7 +133,7 @@ private:
                                   && !constp->num().isString();  // Not a string
         if (useConstPool) {
             // Extract into constant pool.
-            const bool merge = v3Global.opt.mergeConstPool();
+            const bool merge = v3Global.opt.fMergeConstPool();
             varp = v3Global.rootp()->constPoolp()->findConst(constp, merge)->varp();
             nodep->deleteTree();
             ++m_extractedToConstPool;
diff --git a/test_regress/t/t_extract_static_const_no_merge.pl b/test_regress/t/t_extract_static_const_no_merge.pl
index ff9a694d4..f656fe455 100755
--- a/test_regress/t/t_extract_static_const_no_merge.pl
+++ b/test_regress/t/t_extract_static_const_no_merge.pl
@@ -14,7 +14,7 @@ top_filename("t/t_extract_static_const.v");
 golden_filename("t/t_extract_static_const.out");
 
 compile(
-    verilator_flags2 => ["--stats", "--no-merge-const-pool"],
+    verilator_flags2 => ["--stats", "--fno-merge-const-pool"],
     );
 
 execute(

From ada58465b2141b2e9b451ca3bf017b1e1dd4bc6b Mon Sep 17 00:00:00 2001
From: Wilson Snyder <wsnyder@wsnyder.org>
Date: Fri, 3 Jun 2022 20:43:16 -0400
Subject: [PATCH 17/19] Add -f<optimization> options to replace -O<letter>
 options (#3436).

---
 Changes                                       |   1 +
 bin/verilator                                 |   2 +-
 docs/guide/deprecations.rst                   |   5 +
 docs/guide/exe_verilator.rst                  |  54 +++++--
 src/V3Case.cpp                                |   2 +-
 src/V3Const.cpp                               |  10 +-
 src/V3Gate.cpp                                |   4 +-
 src/V3GraphAcyc.cpp                           |   2 +-
 src/V3Options.cpp                             | 133 ++++++++++--------
 src/V3Options.h                               |  85 ++++++-----
 src/Verilator.cpp                             |  32 ++---
 test_regress/driver.pl                        |  22 +--
 test_regress/t/t_altera_lpm_mult_noinl.pl     |   2 +-
 test_regress/t/t_alw_noreorder.pl             |   2 +-
 test_regress/t/t_assign_inline.pl             |   2 +-
 test_regress/t/t_assign_slice_overflow_ox.pl  |   2 +-
 test_regress/t/t_case_66bits_noexpand.pl      |   2 +-
 test_regress/t/t_case_incrdecr.pl             |   2 +-
 test_regress/t/t_case_write1_noexpand.pl      |   2 +-
 test_regress/t/t_const_no_opt.pl              |   2 +-
 test_regress/t/t_emit_constw.pl               |   2 +-
 test_regress/t/t_func_twocall_noexpand.pl     |   2 +-
 test_regress/t/t_gen_genblk_noinl.pl          |   2 +-
 test_regress/t/t_incr_void.pl                 |   2 +-
 test_regress/t/t_inst_slice_noinl.pl          |   2 +-
 test_regress/t/t_interface1_modport_noinl.pl  |   2 +-
 test_regress/t/t_interface1_noinl.pl          |   2 +-
 test_regress/t/t_interface2_noinl.pl          |   2 +-
 test_regress/t/t_interface_array2_noinl.pl    |   2 +-
 test_regress/t/t_interface_array_noinl.pl     |   2 +-
 test_regress/t/t_interface_down_noinl.pl      |   2 +-
 test_regress/t/t_interface_gen10_noinl.pl     |   2 +-
 test_regress/t/t_interface_gen11_noinl.pl     |   2 +-
 test_regress/t/t_interface_gen12_noinl.pl     |   2 +-
 test_regress/t/t_interface_gen2_noinl.pl      |   2 +-
 test_regress/t/t_interface_gen3_noinl.pl      |   2 +-
 test_regress/t/t_interface_gen4_noinl.pl      |   2 +-
 test_regress/t/t_interface_gen5_noinl.pl      |   2 +-
 test_regress/t/t_interface_gen6_noinl.pl      |   2 +-
 test_regress/t/t_interface_gen7_noinl.pl      |   2 +-
 test_regress/t/t_interface_gen8_noinl.pl      |   2 +-
 test_regress/t/t_interface_gen9_noinl.pl      |   2 +-
 test_regress/t/t_interface_gen_noinl.pl       |   2 +-
 test_regress/t/t_interface_inl.pl             |   2 +-
 .../t/t_interface_modport_import_noinl.pl     |   2 +-
 test_regress/t/t_interface_modport_inl.pl     |   2 +-
 test_regress/t/t_interface_modport_noinl.pl   |   2 +-
 test_regress/t/t_interface_mp_func_noinl.pl   |   2 +-
 test_regress/t/t_interface_nest_noinl.pl      |   2 +-
 test_regress/t/t_interface_noinl.pl           |   2 +-
 test_regress/t/t_interface_twod_noinl.pl      |   2 +-
 test_regress/t/t_lint_setout_bad_noinl.pl     |   2 +-
 test_regress/t/t_math_cond_huge_noexpand.pl   |   2 +-
 test_regress/t/t_math_div_noexpand.pl         |   2 +-
 test_regress/t/t_math_eq_noexpand.pl          |   2 +-
 test_regress/t/t_math_red_noexpand.pl         |   2 +-
 test_regress/t/t_math_shift_noexpand.pl       |   2 +-
 test_regress/t/t_math_signed_noexpand.pl      |   2 +-
 test_regress/t/t_math_vliw_noexpand.pl        |   2 +-
 test_regress/t/t_mem_multi_io.pl              |   2 +-
 test_regress/t/t_mem_multi_io2_cc.pl          |   2 +-
 test_regress/t/t_mem_multi_io2_sc.pl          |   2 +-
 test_regress/t/t_mem_multi_io3_cc.pl          |   2 +-
 test_regress/t/t_mem_multi_io3_sc.pl          |   2 +-
 test_regress/t/t_mem_multidim_Ox.pl           |   2 +-
 test_regress/t/t_mem_packed_noexpand.pl       |   2 +-
 .../t/t_mod_interface_array0_noinl.pl         |   2 +-
 .../t/t_mod_interface_array1_noinl.pl         |   2 +-
 .../t/t_mod_interface_array2_noinl.pl         |   2 +-
 .../t/t_mod_interface_array4_noinl.pl         |   2 +-
 .../t/t_mod_interface_array6_noinl.pl         |   2 +-
 test_regress/t/t_optm_if_cond.pl              |   2 +-
 test_regress/t/t_var_assign_landr_noexpand.pl |   2 +-
 73 files changed, 260 insertions(+), 214 deletions(-)

diff --git a/Changes b/Changes
index ec20c6fda..bfbc7dc92 100644
--- a/Changes
+++ b/Changes
@@ -17,6 +17,7 @@ Verilator 4.223 devel
 
 **Minor:**
 
+* Add -f<optimization> options to replace -O<letter> options (#3436).
 * Changed --no-merge-const-pool to -fno-merge-const-pool (#3436).
 * Support compile time trace signal selection with tracing_on/off (#3323). [Shunyao CAD]
 * Add assert when VerilatedContext is mis-deleted (#3121). [Rupert Swarbrick]
diff --git a/bin/verilator b/bin/verilator
index 367651d32..f04e2a593 100755
--- a/bin/verilator
+++ b/bin/verilator
@@ -319,7 +319,7 @@ detailed descriptions of these arguments.
      -f <file>                  Parse arguments from a file
      -FI <file>                 Force include of a file
     --flatten                   Force inlining of all modules, tasks and functions
-    --fno-merge-const-pool      Disable merging of different types in const pool
+    --fno-<optimization>        Disable internal optimization stage
      -G<name>=<value>           Overwrite top-level parameter
     --gdb                       Run Verilator under GDB interactively
     --gdbbt                     Run Verilator under GDB for backtrace
diff --git a/docs/guide/deprecations.rst b/docs/guide/deprecations.rst
index 33c2ef610..4c2d96592 100644
--- a/docs/guide/deprecations.rst
+++ b/docs/guide/deprecations.rst
@@ -20,6 +20,11 @@ Option `--cdc`
   The experimental `--cdc` option is believed to be generally unused and is
   planned for removal no sooner than January 2023.
 
+Option `--O<letter>`
+  The debug `--O<letter>` options have been replaced with
+  `--fno-<optimization>` debug options to match GCC. The old options are
+  planned for removal no sooner than June 2023.
+
 Option `--prof-threads`
   The `--prof-threads` option has been superseded by the `--prof-exec` and
   `--prof-pgo` options and is planned for removal no sooner than April 2023.
diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst
index 70b3752ad..fda5aedb3 100644
--- a/docs/guide/exe_verilator.rst
+++ b/docs/guide/exe_verilator.rst
@@ -431,13 +431,51 @@ Summary:
    flattening large designs may require significant CPU time, memory and
    storage.
 
+.. option:: --fno-acyc-simp
+
+.. option:: --fno-assemble
+
+.. option:: --fno-case
+
+.. option:: --fno-combine
+
+.. option:: --fno-const
+
+.. option:: --fno-const-bit-op-tree
+
+.. option:: --fno-dedup
+
+.. option:: --fno-expand
+
+.. option:: --fno-gate
+
+.. option:: --fno-inline
+
+.. option:: --fno-life
+
+.. option:: --fno-life-post
+
+.. option:: --fno-localize
+
+.. option:: --fno-merge-cond
+
 .. option:: --fno-merge-const-pool
 
-   Rarely needed; only use if recommended by maintainers.  In order to
-   minimize cache footprint, values of different data type, that are yet
-   emitted identically in C++ are merged in the constant pool.  This option
-   disables this and causes every constant pool entry with a distinct data
-   type to be emitted separately.
+.. option:: --fno-reloop
+
+.. option:: --fno-reorder
+
+.. option:: --fno-split
+
+.. option:: --fno-subst
+
+.. option:: --fno-subst-const
+
+.. option:: --fno-table
+
+   Rarely needed. Disables one of the internal optimization steps. These
+   are typically used only when recommended by a maintainer to help debug
+   or work around an issue.
 
 .. option:: -G<name>=<value>
 
@@ -704,9 +742,9 @@ Summary:
 
    Rarely needed.  Enables or disables a specific optimizations, with the
    optimization selected based on the letter passed.  A lowercase letter
-   disables an optimization, an upper case letter enables it.  This is
-   intended for debugging use only; see the source code for
-   version-dependent mappings of optimizations to -O letters.
+   disables an optimization, an upper case letter enables it.  This option
+   is deprecated and the various `-f<optimization>` arguments should be
+   used instead.
 
 .. option:: -o <executable>
 
diff --git a/src/V3Case.cpp b/src/V3Case.cpp
index 161f7db7e..c65fb3e7d 100644
--- a/src/V3Case.cpp
+++ b/src/V3Case.cpp
@@ -496,7 +496,7 @@ private:
         V3Case::caseLint(nodep);
         iterateChildren(nodep);
         if (debug() >= 9) nodep->dumpTree(cout, " case_old: ");
-        if (isCaseTreeFast(nodep) && v3Global.opt.oCase()) {
+        if (isCaseTreeFast(nodep) && v3Global.opt.fCase()) {
             // It's a simple priority encoder or complete statement
             // we can make a tree of statements to avoid extra comparisons
             ++m_statCaseFast;
diff --git a/src/V3Const.cpp b/src/V3Const.cpp
index 9f39e2884..e246180a2 100644
--- a/src/V3Const.cpp
+++ b/src/V3Const.cpp
@@ -1090,7 +1090,7 @@ private:
 
     bool matchBitOpTree(AstNode* nodep) {
         if (nodep->widthMin() != 1) return false;
-        if (!v3Global.opt.oConstBitOpTree()) return false;
+        if (!v3Global.opt.fConstBitOpTree()) return false;
 
         string debugPrefix;
         if (debug() >= 9) {  // LCOV_EXCL_START
@@ -1412,7 +1412,7 @@ private:
         return (VN_IS(nodep, And) || VN_IS(nodep, Or) || VN_IS(nodep, Xor));
     }
     bool ifAdjacentSel(const AstSel* lhsp, const AstSel* rhsp) {
-        if (!v3Global.opt.oAssemble()) return false;  // opt disabled
+        if (!v3Global.opt.fAssemble()) return false;  // opt disabled
         if (!lhsp || !rhsp) return false;
         const AstNode* const lfromp = lhsp->fromp();
         const AstNode* const rfromp = rhsp->fromp();
@@ -1427,7 +1427,7 @@ private:
     }
     bool ifMergeAdjacent(AstNode* lhsp, AstNode* rhsp) {
         // called by concatmergeable to determine if {lhsp, rhsp} make sense
-        if (!v3Global.opt.oAssemble()) return false;  // opt disabled
+        if (!v3Global.opt.fAssemble()) return false;  // opt disabled
         // two same varref
         if (operandsSame(lhsp, rhsp)) return true;
         const AstSel* lselp = VN_CAST(lhsp, Sel);
@@ -1464,7 +1464,7 @@ private:
     }
     bool concatMergeable(const AstNode* lhsp, const AstNode* rhsp, unsigned depth) {
         // determine if {a OP b, c OP d} => {a, c} OP {b, d} is advantageous
-        if (!v3Global.opt.oAssemble()) return false;  // opt disabled
+        if (!v3Global.opt.fAssemble()) return false;  // opt disabled
         if (lhsp->type() != rhsp->type()) return false;
         if (!ifConcatMergeableBiop(lhsp)) return false;
         if (depth > CONCAT_MERGABLE_MAX_DEPTH) return false;  // As worse case O(n^2) algorithm
@@ -2550,7 +2550,7 @@ private:
             if (nodep->access().isReadOnly()
                 && ((!m_params  // Can reduce constant wires into equations
                      && m_doNConst
-                     && v3Global.opt.oConst()
+                     && v3Global.opt.fConst()
                      // Default value, not a "known" constant for this usage
                      && !nodep->varp()->isClassMember()
                      && !(nodep->varp()->isFuncLocal() && nodep->varp()->isNonOutput())
diff --git a/src/V3Gate.cpp b/src/V3Gate.cpp
index 4b66c2661..cf3485121 100644
--- a/src/V3Gate.cpp
+++ b/src/V3Gate.cpp
@@ -397,11 +397,11 @@ private:
         // Then propagate more complicated equations
         optimizeSignals(true);
         // Remove redundant logic
-        if (v3Global.opt.oDedupe()) {
+        if (v3Global.opt.fDedupe()) {
             dedupe();
             if (debug() >= 6) m_graph.dumpDotFilePrefixed("gate_dedup");
         }
-        if (v3Global.opt.oAssemble()) {
+        if (v3Global.opt.fAssemble()) {
             mergeAssigns();
             if (debug() >= 6) m_graph.dumpDotFilePrefixed("gate_assm");
         }
diff --git a/src/V3GraphAcyc.cpp b/src/V3GraphAcyc.cpp
index a62fd3d9d..0df758ed1 100644
--- a/src/V3GraphAcyc.cpp
+++ b/src/V3GraphAcyc.cpp
@@ -254,7 +254,7 @@ void GraphAcyc::simplify(bool allowCut) {
         if (allowCut) {
             // The main algorithm works without these, though slower
             // So if changing the main algorithm, comment these out for a test run
-            if (v3Global.opt.oAcycSimp()) {
+            if (v3Global.opt.fAcycSimp()) {
                 cutBasic(vertexp);
                 cutBackward(vertexp);
             }
diff --git a/src/V3Options.cpp b/src/V3Options.cpp
index 2a4c3050d..7d4b1e846 100644
--- a/src/V3Options.cpp
+++ b/src/V3Options.cpp
@@ -1082,7 +1082,28 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
         parseOptsFile(fl, parseFileArg(optdir, valp), false);
     });
     DECL_OPTION("-flatten", OnOff, &m_flatten);
+
+    DECL_OPTION("-facyc-simp", FOnOff, &m_fAcycSimp);
+    DECL_OPTION("-fassemble", FOnOff, &m_fAssemble);
+    DECL_OPTION("-fcase", FOnOff, &m_fCase);
+    DECL_OPTION("-fcombine", FOnOff, &m_fCombine);
+    DECL_OPTION("-fconst", FOnOff, &m_fConst);
+    DECL_OPTION("-fconst-bit-op-tree", FOnOff, &m_fConstBitOpTree);
+    DECL_OPTION("-fdedup", FOnOff, &m_fDedupe);
+    DECL_OPTION("-fexpand", FOnOff, &m_fExpand);
+    DECL_OPTION("-fgate", FOnOff, &m_fGate);
+    DECL_OPTION("-finline", FOnOff, &m_fInline);
+    DECL_OPTION("-flife", FOnOff, &m_fLife);
+    DECL_OPTION("-flife-post", FOnOff, &m_fLifePost);
+    DECL_OPTION("-flocalize", FOnOff, &m_fLocalize);
+    DECL_OPTION("-fmerge-cond", FOnOff, &m_fMergeCond);
     DECL_OPTION("-fmerge-const-pool", FOnOff, &m_fMergeConstPool);
+    DECL_OPTION("-freloop", FOnOff, &m_fReloop);
+    DECL_OPTION("-freorder", FOnOff, &m_fReorder);
+    DECL_OPTION("-fsplit", FOnOff, &m_fSplit);
+    DECL_OPTION("-fsubst", FOnOff, &m_fSubst);
+    DECL_OPTION("-fsubst-const", FOnOff, &m_fSubstConst);
+    DECL_OPTION("-ftable", FOnOff, &m_fTable);
 
     DECL_OPTION("-G", CbPartialMatch, [this](const char* optp) { addParameter(optp, false); });
     DECL_OPTION("-gate-stmts", Set, &m_gateStmts);
@@ -1155,47 +1176,49 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
     DECL_OPTION("-max-num-width", Set, &m_maxNumWidth);
     DECL_OPTION("-mod-prefix", Set, &m_modPrefix);
 
-    DECL_OPTION("-O", CbPartialMatch, [this](const char* optp) {
-        // Optimization
+    DECL_OPTION("-O0", CbCall, [this]() { optimize(0); });
+    DECL_OPTION("-O1", CbCall, [this]() { optimize(1); });
+    DECL_OPTION("-O2", CbCall, [this]() { optimize(2); });
+    DECL_OPTION("-O3", CbCall, [this]() { optimize(3); });
+
+    DECL_OPTION("-O", CbPartialMatch, [this, fl](const char* optp) {
+        // Optimization, e.g. -O1rX
+        // LCOV_EXCL_START
+        fl->v3warn(DEPRECATED, "Option -O<letter> is deprecated. "
+                               "Use -f<optimization> or -fno-<optimization> instead.");
         for (const char* cp = optp; *cp; ++cp) {
             const bool flag = isupper(*cp);
             switch (tolower(*cp)) {
-            case '0': optimize(0); break;  // 0=all off
-            case '1': optimize(1); break;  // 1=all on
-            case '2': optimize(2); break;  // 2=not used
-            case '3': optimize(3); break;  // 3=high
-            case 'a': m_oTable = flag; break;
-            case 'b': m_oCombine = flag; break;
-            case 'c': m_oConst = flag; break;
-            case 'd': m_oDedupe = flag; break;
-            case 'e': m_oCase = flag; break;
-            //    f
-            case 'g': m_oGate = flag; break;
-            //    h
-            case 'i': m_oInline = flag; break;
-            //    j
-            case 'k': m_oSubstConst = flag; break;
-            case 'l': m_oLife = flag; break;
-            case 'm': m_oAssemble = flag; break;
-            //    n
-            case 'o':
-                m_oConstBitOpTree = flag;
-                break;  // Can remove ~2022-01 when stable
-            //    o will be used as an escape for a second character of optimization disables
+            case '0': optimize(0); break;
+            case '1': optimize(1); break;
+            case '2': optimize(2); break;
+            case '3': optimize(3); break;
+            case 'a': m_fTable = flag; break;  // == -fno-table
+            case 'b': m_fCombine = flag; break;  // == -fno-combine
+            case 'c': m_fConst = flag; break;  // == -fno-const
+            case 'd': m_fDedupe = flag; break;  // == -fno-dedup
+            case 'e': m_fCase = flag; break;  // == -fno-case
+            case 'g': m_fGate = flag; break;  // == -fno-gate
+            case 'i': m_fInline = flag; break;  // == -fno-inline
+            case 'k': m_fSubstConst = flag; break;  // == -fno-subst-const
+            case 'l': m_fLife = flag; break;  // == -fno-life
+            case 'm': m_fAssemble = flag; break;  // == -fno-assemble
+            case 'o': m_fConstBitOpTree = flag; break;  // == -fno-const-bit-op-tree
             case 'p':
                 m_public = !flag;
                 break;  // With -Op so flag=0, we want public on so few optimizations done
-            //    q
-            case 'r': m_oReorder = flag; break;
-            case 's': m_oSplit = flag; break;
-            case 't': m_oLifePost = flag; break;
-            case 'u': m_oSubst = flag; break;
-            case 'v': m_oReloop = flag; break;
-            case 'w': m_oMergeCond = flag; break;
-            case 'x': m_oExpand = flag; break;
-            case 'y': m_oAcycSimp = flag; break;
-            case 'z': m_oLocalize = flag; break;
-            default: break;  // No error, just ignore
+            case 'r': m_fReorder = flag; break;  // == -fno-reorder
+            case 's': m_fSplit = flag; break;  // == -fno-split
+            case 't': m_fLifePost = flag; break;  // == -fno-life-post
+            case 'u': m_fSubst = flag; break;  // == -fno-subst
+            case 'v': m_fReloop = flag; break;  // == -fno-reloop
+            case 'w': m_fMergeCond = flag; break;  // == -fno-merge-cond
+            case 'x': m_fExpand = flag; break;  // == -fno-expand
+            case 'y': m_fAcycSimp = flag; break;  // == -fno-acyc-simp
+            case 'z': m_fLocalize = flag; break;  // == -fno-localize
+            default:
+                break;  // No error, just ignore
+                // LCOV_EXCL_STOP
             }
         }
     });
@@ -1787,26 +1810,26 @@ int V3Options::dumpTreeLevel(const string& srcfile_path) {
 void V3Options::optimize(int level) {
     // Set all optimizations to on/off
     const bool flag = level > 0;
-    m_oAcycSimp = flag;
-    m_oAssemble = flag;
-    m_oCase = flag;
-    m_oCombine = flag;
-    m_oConst = flag;
-    m_oConstBitOpTree = flag;
-    m_oDedupe = flag;
-    m_oExpand = flag;
-    m_oGate = flag;
-    m_oInline = flag;
-    m_oLife = flag;
-    m_oLifePost = flag;
-    m_oLocalize = flag;
-    m_oMergeCond = flag;
-    m_oReloop = flag;
-    m_oReorder = flag;
-    m_oSplit = flag;
-    m_oSubst = flag;
-    m_oSubstConst = flag;
-    m_oTable = flag;
+    m_fAcycSimp = flag;
+    m_fAssemble = flag;
+    m_fCase = flag;
+    m_fCombine = flag;
+    m_fConst = flag;
+    m_fConstBitOpTree = flag;
+    m_fDedupe = flag;
+    m_fExpand = flag;
+    m_fGate = flag;
+    m_fInline = flag;
+    m_fLife = flag;
+    m_fLifePost = flag;
+    m_fLocalize = flag;
+    m_fMergeCond = flag;
+    m_fReloop = flag;
+    m_fReorder = flag;
+    m_fSplit = flag;
+    m_fSubst = flag;
+    m_fSubstConst = flag;
+    m_fTable = flag;
     // And set specific optimization levels
     if (level >= 3) {
         m_inlineMult = -1;  // Maximum inlining
diff --git a/src/V3Options.h b/src/V3Options.h
index e1756ab3d..137580c34 100644
--- a/src/V3Options.h
+++ b/src/V3Options.h
@@ -239,7 +239,6 @@ private:
     bool m_dumpDefines = false;     // main switch: --dump-defines
     bool m_dumpTreeAddrids = false; // main switch: --dump-tree-addrids
     bool m_exe = false;             // main switch: --exe
-    bool m_fMergeConstPool = true;  // main switch: --fmerge-const-pool
     bool m_flatten = false;         // main switch: --flatten
     bool m_hierarchical = false;    // main switch: --hierarchical
     bool m_hierChild = false;       // main switch: --hierarchical-child
@@ -341,27 +340,27 @@ private:
     V3LangCode  m_defaultLanguage;      // main switch: --language
 
     // MEMBERS (optimizations)
-    //                          // main switch: -Op: --public
-    bool        m_oAcycSimp;    // main switch: -Oy: acyclic pre-optimizations
-    bool        m_oAssemble;    // main switch: -Om: assign assemble
-    bool        m_oCase;        // main switch: -Oe: case tree conversion
-    bool        m_oCombine;     // main switch: -Ob: common icode packing
-    bool        m_oConst;       // main switch: -Oc: constant folding
-    bool        m_oConstBitOpTree;  // main switch: -Oo: constant bit op tree
-    bool        m_oDedupe;      // main switch: -Od: logic deduplication
-    bool        m_oExpand;      // main switch: -Ox: expansion of C macros
-    bool        m_oGate;        // main switch: -Og: gate wire elimination
-    bool        m_oInline;      // main switch: -Oi: module inlining
-    bool        m_oLife;        // main switch: -Ol: variable lifetime
-    bool        m_oLifePost;    // main switch: -Ot: delayed assignment elimination
-    bool        m_oLocalize;    // main switch: -Oz: convert temps to local variables
-    bool        m_oMergeCond;   // main switch: -Ob: merge conditionals
-    bool        m_oReloop;      // main switch: -Ov: reform loops
-    bool        m_oReorder;     // main switch: -Or: reorder assignments in blocks
-    bool        m_oSplit;       // main switch: -Os: always assignment splitting
-    bool        m_oSubst;       // main switch: -Ou: substitute expression temp values
-    bool        m_oSubstConst;  // main switch: -Ok: final constant substitution
-    bool        m_oTable;       // main switch: -Oa: lookup table creation
+    bool m_fAcycSimp;    // main switch: -fno-acyc-simp: acyclic pre-optimizations
+    bool m_fAssemble;    // main switch: -fno-assemble: assign assemble
+    bool m_fCase;        // main switch: -fno-case: case tree conversion
+    bool m_fCombine;     // main switch: -fno-combine: common icode packing
+    bool m_fConst;       // main switch: -fno-const: constant folding
+    bool m_fConstBitOpTree;  // main switch: -fno-const-bit-op-tree: constant bit op tree
+    bool m_fDedupe;      // main switch: -fno-dedupe: logic deduplication
+    bool m_fExpand;      // main switch: -fno-expand: expansion of C macros
+    bool m_fGate;        // main switch: -fno-gate: gate wire elimination
+    bool m_fInline;      // main switch: -fno-inline: module inlining
+    bool m_fLife;        // main switch: -fno-life: variable lifetime
+    bool m_fLifePost;    // main switch: -fno-life-post: delayed assignment elimination
+    bool m_fLocalize;    // main switch: -fno-localize: convert temps to local variables
+    bool m_fMergeCond;   // main switch: -fno-merge-cond: merge conditionals
+    bool m_fMergeConstPool = true;  // main switch: --fmerge-const-pool
+    bool m_fReloop;      // main switch: -fno-reloop: reform loops
+    bool m_fReorder;     // main switch: -fno-reorder: reorder assignments in blocks
+    bool m_fSplit;       // main switch: -fno-split: always assignment splitting
+    bool m_fSubst;       // main switch: -fno-subst: substitute expression temp values
+    bool m_fSubstConst;  // main switch: -fno-subst-const: final constant substitution
+    bool m_fTable;       // main switch: -fno-table: lookup table creation
     // clang-format on
 
     bool m_available = false;  // Set to true at the end of option parsing
@@ -448,7 +447,6 @@ public:
     bool dpiHdrOnly() const { return m_dpiHdrOnly; }
     bool dumpDefines() const { return m_dumpDefines; }
     bool exe() const { return m_exe; }
-    bool fMergeConstPool() const { return m_fMergeConstPool; }
     bool flatten() const { return m_flatten; }
     bool gmake() const { return m_gmake; }
     bool threadsDpiPure() const { return m_threadsDpiPure; }
@@ -575,26 +573,27 @@ public:
     bool isNoClocker(const string& signame) const;
 
     // ACCESSORS (optimization options)
-    bool oAcycSimp() const { return m_oAcycSimp; }
-    bool oAssemble() const { return m_oAssemble; }
-    bool oCase() const { return m_oCase; }
-    bool oCombine() const { return m_oCombine; }
-    bool oConst() const { return m_oConst; }
-    bool oConstBitOpTree() const { return m_oConstBitOpTree; }
-    bool oDedupe() const { return m_oDedupe; }
-    bool oExpand() const { return m_oExpand; }
-    bool oGate() const { return m_oGate; }
-    bool oInline() const { return m_oInline; }
-    bool oLife() const { return m_oLife; }
-    bool oLifePost() const { return m_oLifePost; }
-    bool oLocalize() const { return m_oLocalize; }
-    bool oMergeCond() const { return m_oMergeCond; }
-    bool oReloop() const { return m_oReloop; }
-    bool oReorder() const { return m_oReorder; }
-    bool oSplit() const { return m_oSplit; }
-    bool oSubst() const { return m_oSubst; }
-    bool oSubstConst() const { return m_oSubstConst; }
-    bool oTable() const { return m_oTable; }
+    bool fAcycSimp() const { return m_fAcycSimp; }
+    bool fAssemble() const { return m_fAssemble; }
+    bool fCase() const { return m_fCase; }
+    bool fCombine() const { return m_fCombine; }
+    bool fConst() const { return m_fConst; }
+    bool fConstBitOpTree() const { return m_fConstBitOpTree; }
+    bool fDedupe() const { return m_fDedupe; }
+    bool fExpand() const { return m_fExpand; }
+    bool fGate() const { return m_fGate; }
+    bool fInline() const { return m_fInline; }
+    bool fLife() const { return m_fLife; }
+    bool fLifePost() const { return m_fLifePost; }
+    bool fLocalize() const { return m_fLocalize; }
+    bool fMergeCond() const { return m_fMergeCond; }
+    bool fMergeConstPool() const { return m_fMergeConstPool; }
+    bool fReloop() const { return m_fReloop; }
+    bool fReorder() const { return m_fReorder; }
+    bool fSplit() const { return m_fSplit; }
+    bool fSubst() const { return m_fSubst; }
+    bool fSubstConst() const { return m_fSubstConst; }
+    bool fTable() const { return m_fTable; }
 
     string traceClassBase() const { return m_traceFormat.classBase(); }
     string traceClassLang() const { return m_traceFormat.classBase() + (systemC() ? "Sc" : "C"); }
diff --git a/src/Verilator.cpp b/src/Verilator.cpp
index e233a041c..97e3393c4 100644
--- a/src/Verilator.cpp
+++ b/src/Verilator.cpp
@@ -239,7 +239,7 @@ static void process() {
         // Module inlining
         // Cannot remove dead variables after this, as alias information for final
         // V3Scope's V3LinkDot is in the AstVar.
-        if (v3Global.opt.oInline()) {
+        if (v3Global.opt.fInline()) {
             V3Inline::inlineAll(v3Global.rootp());
             V3LinkDot::linkDotArrayed(v3Global.rootp());  // Cleanup as made new modules
         }
@@ -310,11 +310,11 @@ static void process() {
         // Push constants across variables and remove redundant assignments
         V3Const::constifyAll(v3Global.rootp());
 
-        if (v3Global.opt.oLife()) V3Life::lifeAll(v3Global.rootp());
+        if (v3Global.opt.fLife()) V3Life::lifeAll(v3Global.rootp());
 
         // Make large low-fanin logic blocks into lookup tables
         // This should probably be done much later, once we have common logic elimination.
-        if (!v3Global.opt.lintOnly() && v3Global.opt.oTable()) {
+        if (!v3Global.opt.lintOnly() && v3Global.opt.fTable()) {
             V3Table::tableAll(v3Global.rootp());
         }
 
@@ -328,7 +328,7 @@ static void process() {
         V3Active::activeAll(v3Global.rootp());
 
         // Split single ALWAYS blocks into multiple blocks for better ordering chances
-        if (v3Global.opt.oSplit()) V3Split::splitAlwaysAll(v3Global.rootp());
+        if (v3Global.opt.fSplit()) V3Split::splitAlwaysAll(v3Global.rootp());
         V3SplitAs::splitAsAll(v3Global.rootp());
 
         // Create tracing sample points, before we start eliminating signals
@@ -340,11 +340,11 @@ static void process() {
 
         // Gate-based logic elimination; eliminate signals and push constant across cell boundaries
         // Instant propagation makes lots-o-constant reduction possibilities.
-        if (v3Global.opt.oGate()) {
+        if (v3Global.opt.fGate()) {
             V3Gate::gateAll(v3Global.rootp());
             // V3Gate calls constant propagation itself.
         } else {
-            v3info("Command Line disabled gate optimization with -Og/-O0.  "
+            v3info("Command Line disabled gate optimization with -fno-gate.  "
                    "This may cause ordering problems.");
         }
 
@@ -363,7 +363,7 @@ static void process() {
         }
 
         // Reorder assignments in pipelined blocks
-        if (v3Global.opt.oReorder()) V3Split::splitReorderAll(v3Global.rootp());
+        if (v3Global.opt.fReorder()) V3Split::splitReorderAll(v3Global.rootp());
 
         // Create delayed assignments
         // This creates lots of duplicate ACTIVES so ActiveTop needs to be after this step
@@ -388,11 +388,11 @@ static void process() {
         // Cleanup any dly vars or other temps that are simple assignments
         // Life must be done before Subst, as it assumes each CFunc under
         // _eval is called only once.
-        if (v3Global.opt.oLife()) {
+        if (v3Global.opt.fLife()) {
             V3Const::constifyAll(v3Global.rootp());
             V3Life::lifeAll(v3Global.rootp());
         }
-        if (v3Global.opt.oLifePost()) V3LifePost::lifepostAll(v3Global.rootp());
+        if (v3Global.opt.fLifePost()) V3LifePost::lifepostAll(v3Global.rootp());
 
         // Remove unused vars
         V3Const::constifyAll(v3Global.rootp());
@@ -422,13 +422,13 @@ static void process() {
         v3Global.assertScoped(false);
 
         // Move variables from modules to function local variables where possible
-        if (v3Global.opt.oLocalize()) V3Localize::localizeAll(v3Global.rootp());
+        if (v3Global.opt.fLocalize()) V3Localize::localizeAll(v3Global.rootp());
 
         // Remove remaining scopes; make varrefs/funccalls relative to current module
         V3Descope::descopeAll(v3Global.rootp());
 
         // Icache packing; combine common code in each module's functions into subroutines
-        if (v3Global.opt.oCombine()) V3Combine::combineAll(v3Global.rootp());
+        if (v3Global.opt.fCombine()) V3Combine::combineAll(v3Global.rootp());
     }
 
     V3Error::abortIfErrors();
@@ -452,30 +452,30 @@ static void process() {
     }
 
     // Expand macros and wide operators into C++ primitives
-    if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly() && v3Global.opt.oExpand()) {
+    if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly() && v3Global.opt.fExpand()) {
         V3Expand::expandAll(v3Global.rootp());
     }
 
     // Propagate constants across WORDSEL arrayed temporaries
-    if (!v3Global.opt.xmlOnly() && v3Global.opt.oSubst()) {
+    if (!v3Global.opt.xmlOnly() && v3Global.opt.fSubst()) {
         // Constant folding of expanded stuff
         V3Const::constifyCpp(v3Global.rootp());
         V3Subst::substituteAll(v3Global.rootp());
     }
 
-    if (!v3Global.opt.xmlOnly() && v3Global.opt.oSubstConst()) {
+    if (!v3Global.opt.xmlOnly() && v3Global.opt.fSubstConst()) {
         // Constant folding of substitutions
         V3Const::constifyCpp(v3Global.rootp());
         V3Dead::deadifyAll(v3Global.rootp());
     }
 
     if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly()) {
-        if (v3Global.opt.oMergeCond()) {
+        if (v3Global.opt.fMergeCond()) {
             // Merge conditionals
             V3MergeCond::mergeAll(v3Global.rootp());
         }
 
-        if (v3Global.opt.oReloop()) {
+        if (v3Global.opt.fReloop()) {
             // Reform loops to reduce code size
             // Must be after all Sel/array index based optimizations
             V3Reloop::reloopAll(v3Global.rootp());
diff --git a/test_regress/driver.pl b/test_regress/driver.pl
index fbae94f92..541fb296f 100755
--- a/test_regress/driver.pl
+++ b/test_regress/driver.pl
@@ -77,7 +77,6 @@ my $opt_gdbbt;
 my $opt_gdbsim;
 my $opt_hashset;
 my $opt_jobs = 1;
-my $opt_optimize;
 my $opt_quiet;
 my $opt_rerun;
 my $opt_rrsim;
@@ -104,7 +103,6 @@ if (! GetOptions(
           "hashset=s"   => \$opt_hashset,
           "help"        => \&usage,
           "j=i"         => \$opt_jobs,
-          "optimize:s"  => \$opt_optimize,
           "quiet!"      => \$opt_quiet,
           "rerun!"      => \$opt_rerun,
           "rr!"         => \$opt_rr,
@@ -661,7 +659,7 @@ sub new {
         verilator_define => 'VERILATOR',
         verilator_flags => ["-cc",
                             "-Mdir $self->{obj_dir}",
-                            "-OD",  # As currently disabled unless -O3
+                            "--fdedup",  # As currently disabled unless -O3
                             "--debug-check",
                             "--comp-limit-members 10", ],
         verilator_flags2 => [],
@@ -934,19 +932,6 @@ sub compile_vlt_flags {
         $param{make_main} && $param{verilator_make_gmake};
     unshift @verilator_flags, "../" . $self->{main_filename} if
         $param{make_main} && $param{verilator_make_gmake};
-    if (defined $opt_optimize) {
-        my $letters = "";
-        if ($opt_optimize =~ /[a-zA-Z]/) {
-            $letters = $opt_optimize;
-        } else {  # Randomly turn on/off different optimizations
-            foreach my $l ('a' .. 'z') {
-                $letters .= ((rand() > 0.5) ? $l : uc $l);
-            }
-            unshift @verilator_flags, "--trace" if rand() > 0.5;
-            unshift @verilator_flags, "--coverage" if rand() > 0.5;
-        }
-        unshift @verilator_flags, "--O" . $letters;
-    }
 
     my @cmdargs = (
                    "--prefix " . $param{VM_PREFIX},
@@ -2906,11 +2891,6 @@ Displays this message and program version and exits.
 Run number of parallel tests, or 0 to determine the count based on the
 number of cores installed.  Requires Perl's Parallel::Forker package.
 
-=item --optimize
-
-Randomly turn on/off different optimizations.  With specific flags,
-use those optimization settings
-
 =item --quiet
 
 Suppress all output except for failures and progress messages every 15
diff --git a/test_regress/t/t_altera_lpm_mult_noinl.pl b/test_regress/t/t_altera_lpm_mult_noinl.pl
index 2eac39a3a..63f8aa315 100755
--- a/test_regress/t/t_altera_lpm_mult_noinl.pl
+++ b/test_regress/t/t_altera_lpm_mult_noinl.pl
@@ -15,7 +15,7 @@ top_filename("t/t_altera_lpm.v");
 $module =~ s/_noinl//;
 
 compile(
-    verilator_flags2 => ["--top-module ${module}", "-Oi"]
+    verilator_flags2 => ["--top-module ${module}", "-fno-inline"]
     );
 
 ok(1);
diff --git a/test_regress/t/t_alw_noreorder.pl b/test_regress/t/t_alw_noreorder.pl
index 46d021e6b..edc2a6f7b 100755
--- a/test_regress/t/t_alw_noreorder.pl
+++ b/test_regress/t/t_alw_noreorder.pl
@@ -12,7 +12,7 @@ scenarios(vlt_all => 1);
 
 top_filename("t/t_alw_reorder.v");
 compile(
-    verilator_flags2 => ["--stats -Or"],
+    verilator_flags2 => ["--stats -fno-reorder"],
     );
 
 file_grep($Self->{stats}, qr/Optimizations, Split always\s+(\d+)/i, 0);
diff --git a/test_regress/t/t_assign_inline.pl b/test_regress/t/t_assign_inline.pl
index 27414cae0..1683d1777 100755
--- a/test_regress/t/t_assign_inline.pl
+++ b/test_regress/t/t_assign_inline.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(simulator => 1);
 
 compile(
-    verilator_flags2 => ["-O0 -OG"],
+    verilator_flags2 => ["-O0 -fgate"],
     );
 
 execute(
diff --git a/test_regress/t/t_assign_slice_overflow_ox.pl b/test_regress/t/t_assign_slice_overflow_ox.pl
index 5251be495..8702b94fe 100755
--- a/test_regress/t/t_assign_slice_overflow_ox.pl
+++ b/test_regress/t/t_assign_slice_overflow_ox.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t_assign_slice_overflow.v");
 
 compile(
-    verilator_flags2 => ["-Ox"],
+    verilator_flags2 => ["-fno-expand"],
     );
 
 execute(
diff --git a/test_regress/t/t_case_66bits_noexpand.pl b/test_regress/t/t_case_66bits_noexpand.pl
index fae2f640f..738da6174 100755
--- a/test_regress/t/t_case_66bits_noexpand.pl
+++ b/test_regress/t/t_case_66bits_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_case_66bits.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_case_incrdecr.pl b/test_regress/t/t_case_incrdecr.pl
index abbcf936a..729c0cc8a 100755
--- a/test_regress/t/t_case_incrdecr.pl
+++ b/test_regress/t/t_case_incrdecr.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(simulator => 1);
 
 compile(
-    verilator_flags2 => ["--trace --Os -x-assign 0"],
+    verilator_flags2 => ["--trace --fno-split -x-assign 0"],
     );
 
 execute(
diff --git a/test_regress/t/t_case_write1_noexpand.pl b/test_regress/t/t_case_write1_noexpand.pl
index cadb667e6..48c57c39a 100755
--- a/test_regress/t/t_case_write1_noexpand.pl
+++ b/test_regress/t/t_case_write1_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_case_write1.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_const_no_opt.pl b/test_regress/t/t_const_no_opt.pl
index 33be39810..79bc15076 100755
--- a/test_regress/t/t_const_no_opt.pl
+++ b/test_regress/t/t_const_no_opt.pl
@@ -13,7 +13,7 @@ top_filename("t/t_const_opt.v");
 
 # Run the same design as t_const_opt.pl without bitopt tree optimization to make sure that the result is same.
 compile(
-    verilator_flags2 => ["-Wno-UNOPTTHREADS", "--stats", "-Oo", "$Self->{t_dir}/t_const_opt.cpp"],
+    verilator_flags2 => ["-Wno-UNOPTTHREADS", "--stats", "-fno-const-bit-op-tree", "$Self->{t_dir}/t_const_opt.cpp"],
     );
 
 execute(
diff --git a/test_regress/t/t_emit_constw.pl b/test_regress/t/t_emit_constw.pl
index 9b1487fcd..8f7895804 100755
--- a/test_regress/t/t_emit_constw.pl
+++ b/test_regress/t/t_emit_constw.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(simulator => 1);
 
 compile(
-    verilator_flags2 => ['--Ox'],
+    verilator_flags2 => ['--fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_func_twocall_noexpand.pl b/test_regress/t/t_func_twocall_noexpand.pl
index 001824bc6..452d4b37a 100755
--- a/test_regress/t/t_func_twocall_noexpand.pl
+++ b/test_regress/t/t_func_twocall_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_func_twocall.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_gen_genblk_noinl.pl b/test_regress/t/t_gen_genblk_noinl.pl
index 7574a1cfb..ef537cd4d 100755
--- a/test_regress/t/t_gen_genblk_noinl.pl
+++ b/test_regress/t/t_gen_genblk_noinl.pl
@@ -16,7 +16,7 @@ scenarios(simulator => 1);
 $Self->{sim_time} = 11000;
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_incr_void.pl b/test_regress/t/t_incr_void.pl
index 5b95e5b74..e7d3e18e3 100755
--- a/test_regress/t/t_incr_void.pl
+++ b/test_regress/t/t_incr_void.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(simulator => 1);
 
 compile(
-    verilator_flags2 => ["--Os -x-assign 0"],
+    verilator_flags2 => ["--fno-split -x-assign 0"],
     );
 
 execute(
diff --git a/test_regress/t/t_inst_slice_noinl.pl b/test_regress/t/t_inst_slice_noinl.pl
index 11f75c752..aa56e6155 100755
--- a/test_regress/t/t_inst_slice_noinl.pl
+++ b/test_regress/t/t_inst_slice_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_inst_slice.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface1_modport_noinl.pl b/test_regress/t/t_interface1_modport_noinl.pl
index b077bef4e..4f4b314ae 100755
--- a/test_regress/t/t_interface1_modport_noinl.pl
+++ b/test_regress/t/t_interface1_modport_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface1_modport.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface1_noinl.pl b/test_regress/t/t_interface1_noinl.pl
index 3c9d8d316..867b1e993 100755
--- a/test_regress/t/t_interface1_noinl.pl
+++ b/test_regress/t/t_interface1_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface1.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface2_noinl.pl b/test_regress/t/t_interface2_noinl.pl
index 57b72e7a7..cad1b6e3d 100755
--- a/test_regress/t/t_interface2_noinl.pl
+++ b/test_regress/t/t_interface2_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface2.v");
 
 compile(
-    verilator_flags2 => ["--top-module t -Oi"],
+    verilator_flags2 => ["--top-module t -fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_array2_noinl.pl b/test_regress/t/t_interface_array2_noinl.pl
index ad389d0fb..7bf1518f5 100755
--- a/test_regress/t/t_interface_array2_noinl.pl
+++ b/test_regress/t/t_interface_array2_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_array2.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_array_noinl.pl b/test_regress/t/t_interface_array_noinl.pl
index 02bf8fd89..df71f77e9 100755
--- a/test_regress/t/t_interface_array_noinl.pl
+++ b/test_regress/t/t_interface_array_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_array.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_down_noinl.pl b/test_regress/t/t_interface_down_noinl.pl
index fb03fc988..34ce5cb69 100755
--- a/test_regress/t/t_interface_down_noinl.pl
+++ b/test_regress/t/t_interface_down_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_down.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen10_noinl.pl b/test_regress/t/t_interface_gen10_noinl.pl
index e5c3f22c5..f691c6d0a 100755
--- a/test_regress/t/t_interface_gen10_noinl.pl
+++ b/test_regress/t/t_interface_gen10_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen10.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen11_noinl.pl b/test_regress/t/t_interface_gen11_noinl.pl
index 82a6a9a27..d1e7dd3c0 100755
--- a/test_regress/t/t_interface_gen11_noinl.pl
+++ b/test_regress/t/t_interface_gen11_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen11.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen12_noinl.pl b/test_regress/t/t_interface_gen12_noinl.pl
index c3f59ba19..8ebecd448 100755
--- a/test_regress/t/t_interface_gen12_noinl.pl
+++ b/test_regress/t/t_interface_gen12_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen12.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen2_noinl.pl b/test_regress/t/t_interface_gen2_noinl.pl
index fc7c4bfb1..eb772bab6 100755
--- a/test_regress/t/t_interface_gen2_noinl.pl
+++ b/test_regress/t/t_interface_gen2_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen2.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen3_noinl.pl b/test_regress/t/t_interface_gen3_noinl.pl
index e49dfc39a..b63c72eb9 100755
--- a/test_regress/t/t_interface_gen3_noinl.pl
+++ b/test_regress/t/t_interface_gen3_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen3.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen4_noinl.pl b/test_regress/t/t_interface_gen4_noinl.pl
index 4a0b00930..e724c2859 100755
--- a/test_regress/t/t_interface_gen4_noinl.pl
+++ b/test_regress/t/t_interface_gen4_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen4.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen5_noinl.pl b/test_regress/t/t_interface_gen5_noinl.pl
index 0873ce9c5..5b4852691 100755
--- a/test_regress/t/t_interface_gen5_noinl.pl
+++ b/test_regress/t/t_interface_gen5_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen5.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen6_noinl.pl b/test_regress/t/t_interface_gen6_noinl.pl
index 4c42c6797..e43d9460a 100755
--- a/test_regress/t/t_interface_gen6_noinl.pl
+++ b/test_regress/t/t_interface_gen6_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen6.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen7_noinl.pl b/test_regress/t/t_interface_gen7_noinl.pl
index 27cb3ea61..458c5f0f6 100755
--- a/test_regress/t/t_interface_gen7_noinl.pl
+++ b/test_regress/t/t_interface_gen7_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen7.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen8_noinl.pl b/test_regress/t/t_interface_gen8_noinl.pl
index ba3b2b132..644d9a10e 100755
--- a/test_regress/t/t_interface_gen8_noinl.pl
+++ b/test_regress/t/t_interface_gen8_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen8.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen9_noinl.pl b/test_regress/t/t_interface_gen9_noinl.pl
index 48f4eb8be..6ac0d6296 100755
--- a/test_regress/t/t_interface_gen9_noinl.pl
+++ b/test_regress/t/t_interface_gen9_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen9.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen_noinl.pl b/test_regress/t/t_interface_gen_noinl.pl
index 5813d42eb..17273106f 100755
--- a/test_regress/t/t_interface_gen_noinl.pl
+++ b/test_regress/t/t_interface_gen_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_inl.pl b/test_regress/t/t_interface_inl.pl
index efb67ed7f..08dfa385c 100755
--- a/test_regress/t/t_interface_inl.pl
+++ b/test_regress/t/t_interface_inl.pl
@@ -14,7 +14,7 @@ top_filename("t/t_interface.v");
 
 compile(
     # Avoid inlining so we find bugs in the non-inliner connection code
-    verilator_flags2 => ["-Oi"],
+    verilator_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_modport_import_noinl.pl b/test_regress/t/t_interface_modport_import_noinl.pl
index 3821fef11..a9e97bee1 100755
--- a/test_regress/t/t_interface_modport_import_noinl.pl
+++ b/test_regress/t/t_interface_modport_import_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_modport_import.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_modport_inl.pl b/test_regress/t/t_interface_modport_inl.pl
index 9afcd9cdd..eb2ca2181 100755
--- a/test_regress/t/t_interface_modport_inl.pl
+++ b/test_regress/t/t_interface_modport_inl.pl
@@ -14,7 +14,7 @@ top_filename("t/t_interface_modport.v");
 
 compile(
     # Avoid inlining so we find bugs in the non-inliner connection code
-    verilator_flags2 => ["-Oi"],
+    verilator_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_modport_noinl.pl b/test_regress/t/t_interface_modport_noinl.pl
index 4c051df1a..7f1015d23 100755
--- a/test_regress/t/t_interface_modport_noinl.pl
+++ b/test_regress/t/t_interface_modport_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_modport.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_mp_func_noinl.pl b/test_regress/t/t_interface_mp_func_noinl.pl
index 432a7308a..89f4835b5 100755
--- a/test_regress/t/t_interface_mp_func_noinl.pl
+++ b/test_regress/t/t_interface_mp_func_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_mp_func.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_nest_noinl.pl b/test_regress/t/t_interface_nest_noinl.pl
index 9d88a39a0..e042d33c1 100755
--- a/test_regress/t/t_interface_nest_noinl.pl
+++ b/test_regress/t/t_interface_nest_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_nest.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_noinl.pl b/test_regress/t/t_interface_noinl.pl
index 52cb09c98..7be6235ad 100755
--- a/test_regress/t/t_interface_noinl.pl
+++ b/test_regress/t/t_interface_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_twod_noinl.pl b/test_regress/t/t_interface_twod_noinl.pl
index 18f0adf62..e77089cb0 100755
--- a/test_regress/t/t_interface_twod_noinl.pl
+++ b/test_regress/t/t_interface_twod_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_twod.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_lint_setout_bad_noinl.pl b/test_regress/t/t_lint_setout_bad_noinl.pl
index 4b5131821..cbbf96bb4 100755
--- a/test_regress/t/t_lint_setout_bad_noinl.pl
+++ b/test_regress/t/t_lint_setout_bad_noinl.pl
@@ -13,7 +13,7 @@ scenarios(linter => 1);
 top_filename("t/t_lint_setout_bad.v");
 
 lint(
-    verilator_flags2 => ["--lint-only -Oi"],
+    verilator_flags2 => ["--lint-only -fno-inline"],
     fails => 1,
     expect_filename => $Self->{golden_filename},
     );
diff --git a/test_regress/t/t_math_cond_huge_noexpand.pl b/test_regress/t/t_math_cond_huge_noexpand.pl
index 0ae4e3ce4..15399cb9f 100755
--- a/test_regress/t/t_math_cond_huge_noexpand.pl
+++ b/test_regress/t/t_math_cond_huge_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_cond_huge.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_math_div_noexpand.pl b/test_regress/t/t_math_div_noexpand.pl
index 4dbcba15c..fa7ecd2ec 100755
--- a/test_regress/t/t_math_div_noexpand.pl
+++ b/test_regress/t/t_math_div_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_div.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_math_eq_noexpand.pl b/test_regress/t/t_math_eq_noexpand.pl
index f8b2375c0..2c3907b70 100755
--- a/test_regress/t/t_math_eq_noexpand.pl
+++ b/test_regress/t/t_math_eq_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_eq.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_math_red_noexpand.pl b/test_regress/t/t_math_red_noexpand.pl
index 89e54c0c9..655ce0246 100755
--- a/test_regress/t/t_math_red_noexpand.pl
+++ b/test_regress/t/t_math_red_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_red.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_math_shift_noexpand.pl b/test_regress/t/t_math_shift_noexpand.pl
index e8a59556b..8584197a0 100755
--- a/test_regress/t/t_math_shift_noexpand.pl
+++ b/test_regress/t/t_math_shift_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_shift.v");
 
 compile(
-    verilator_flags2 => ["-Wno-CLKDATA", '-Ox'],
+    verilator_flags2 => ["-Wno-CLKDATA", '-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_math_signed_noexpand.pl b/test_regress/t/t_math_signed_noexpand.pl
index 336d35594..b086af557 100755
--- a/test_regress/t/t_math_signed_noexpand.pl
+++ b/test_regress/t/t_math_signed_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_signed.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_math_vliw_noexpand.pl b/test_regress/t/t_math_vliw_noexpand.pl
index fce202e04..5ca1e425f 100755
--- a/test_regress/t/t_math_vliw_noexpand.pl
+++ b/test_regress/t/t_math_vliw_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_vliw.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_mem_multi_io.pl b/test_regress/t/t_mem_multi_io.pl
index 1691d75f1..4e371f1d7 100755
--- a/test_regress/t/t_mem_multi_io.pl
+++ b/test_regress/t/t_mem_multi_io.pl
@@ -12,7 +12,7 @@ scenarios(simulator => 1);
 
 compile(
     # Disable inlining, this test is trivial without it
-    verilator_flags2 => ["-Oi --trace"],
+    verilator_flags2 => ["-fno-inline --trace"],
     verilator_flags3 => [],
     );
 
diff --git a/test_regress/t/t_mem_multi_io2_cc.pl b/test_regress/t/t_mem_multi_io2_cc.pl
index 3edda698b..bfd551aed 100755
--- a/test_regress/t/t_mem_multi_io2_cc.pl
+++ b/test_regress/t/t_mem_multi_io2_cc.pl
@@ -15,7 +15,7 @@ top_filename("t/t_mem_multi_io2.v");
 compile(
     make_top_shell => 0,
     make_main => 0,
-    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io2.cpp -Oi"],
+    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io2.cpp -fno-inline"],
     verilator_flags3 => [],
     );
 
diff --git a/test_regress/t/t_mem_multi_io2_sc.pl b/test_regress/t/t_mem_multi_io2_sc.pl
index 11ae8cbfc..2fb4bf70c 100755
--- a/test_regress/t/t_mem_multi_io2_sc.pl
+++ b/test_regress/t/t_mem_multi_io2_sc.pl
@@ -15,7 +15,7 @@ top_filename("t/t_mem_multi_io2.v");
 compile(
     make_top_shell => 0,
     make_main => 0,
-    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io2.cpp --sc -Oi"],
+    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io2.cpp --sc -fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_mem_multi_io3_cc.pl b/test_regress/t/t_mem_multi_io3_cc.pl
index 4ad019dbf..b6090a775 100755
--- a/test_regress/t/t_mem_multi_io3_cc.pl
+++ b/test_regress/t/t_mem_multi_io3_cc.pl
@@ -15,7 +15,7 @@ top_filename("t/t_mem_multi_io3.v");
 compile(
     make_top_shell => 0,
     make_main => 0,
-    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io3.cpp -Oi"],
+    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io3.cpp -fno-inline"],
     verilator_flags3 => [],
     );
 
diff --git a/test_regress/t/t_mem_multi_io3_sc.pl b/test_regress/t/t_mem_multi_io3_sc.pl
index 5825c7845..f37d9dedd 100755
--- a/test_regress/t/t_mem_multi_io3_sc.pl
+++ b/test_regress/t/t_mem_multi_io3_sc.pl
@@ -15,7 +15,7 @@ top_filename("t/t_mem_multi_io3.v");
 compile(
     make_top_shell => 0,
     make_main => 0,
-    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io3.cpp --sc -Oi"],
+    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io3.cpp --sc -fno-inline"],
     verilator_flags3 => [],
     );
 
diff --git a/test_regress/t/t_mem_multidim_Ox.pl b/test_regress/t/t_mem_multidim_Ox.pl
index bb4dbc122..ccde0bbbd 100755
--- a/test_regress/t/t_mem_multidim_Ox.pl
+++ b/test_regress/t/t_mem_multidim_Ox.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_mem_multidim.v");
 
 compile(
-    verilator_flags2 => ['--Ox'],
+    verilator_flags2 => ['--fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_mem_packed_noexpand.pl b/test_regress/t/t_mem_packed_noexpand.pl
index d5fc2b5da..df4c82d6d 100755
--- a/test_regress/t/t_mem_packed_noexpand.pl
+++ b/test_regress/t/t_mem_packed_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_mem_packed.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_mod_interface_array0_noinl.pl b/test_regress/t/t_mod_interface_array0_noinl.pl
index 3c74fd016..56032e0d9 100755
--- a/test_regress/t/t_mod_interface_array0_noinl.pl
+++ b/test_regress/t/t_mod_interface_array0_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_mod_interface_array0.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_mod_interface_array1_noinl.pl b/test_regress/t/t_mod_interface_array1_noinl.pl
index 34871282a..651bb1c65 100755
--- a/test_regress/t/t_mod_interface_array1_noinl.pl
+++ b/test_regress/t/t_mod_interface_array1_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_mod_interface_array1.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_mod_interface_array2_noinl.pl b/test_regress/t/t_mod_interface_array2_noinl.pl
index c19612e57..2afa9e020 100755
--- a/test_regress/t/t_mod_interface_array2_noinl.pl
+++ b/test_regress/t/t_mod_interface_array2_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_mod_interface_array2.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_mod_interface_array4_noinl.pl b/test_regress/t/t_mod_interface_array4_noinl.pl
index 6797c1016..62ad2ca24 100755
--- a/test_regress/t/t_mod_interface_array4_noinl.pl
+++ b/test_regress/t/t_mod_interface_array4_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_mod_interface_array4.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_mod_interface_array6_noinl.pl b/test_regress/t/t_mod_interface_array6_noinl.pl
index 5244ac42c..f07ea1917 100755
--- a/test_regress/t/t_mod_interface_array6_noinl.pl
+++ b/test_regress/t/t_mod_interface_array6_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_mod_interface_array6.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_optm_if_cond.pl b/test_regress/t/t_optm_if_cond.pl
index b67f09305..7910f570f 100755
--- a/test_regress/t/t_optm_if_cond.pl
+++ b/test_regress/t/t_optm_if_cond.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(vlt => 1);
 
 compile(
-    verilator_flags2 => ['--stats', "-Ow"],
+    verilator_flags2 => ['--stats', "-fno-merge-cond"],
     );
 
 if ($Self->{vlt_all}) {
diff --git a/test_regress/t/t_var_assign_landr_noexpand.pl b/test_regress/t/t_var_assign_landr_noexpand.pl
index cd058334d..e616f77c3 100755
--- a/test_regress/t/t_var_assign_landr_noexpand.pl
+++ b/test_regress/t/t_var_assign_landr_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_var_assign_landr.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(

From 59dc2853e3bc899a079a1022fd955329b41cd769 Mon Sep 17 00:00:00 2001
From: Wilson Snyder <wsnyder@wsnyder.org>
Date: Fri, 3 Jun 2022 21:32:13 -0400
Subject: [PATCH 18/19] Support concat assignment to unpacked array (#3446).

---
 Changes                                  |  5 ++--
 src/V3AstNodes.h                         |  1 +
 src/V3Width.cpp                          | 31 ++++++++++++++++++--
 test_regress/t/t_concat_unpack.pl        | 21 ++++++++++++++
 test_regress/t/t_concat_unpack.v         | 36 ++++++++++++++++++++++++
 test_regress/t/t_unpacked_concat_bad.out | 19 +------------
 6 files changed, 91 insertions(+), 22 deletions(-)
 create mode 100755 test_regress/t/t_concat_unpack.pl
 create mode 100755 test_regress/t/t_concat_unpack.v

diff --git a/Changes b/Changes
index bfbc7dc92..53560e563 100644
--- a/Changes
+++ b/Changes
@@ -20,10 +20,11 @@ Verilator 4.223 devel
 * Add -f<optimization> options to replace -O<letter> options (#3436).
 * Changed --no-merge-const-pool to -fno-merge-const-pool (#3436).
 * Support compile time trace signal selection with tracing_on/off (#3323). [Shunyao CAD]
-* Add assert when VerilatedContext is mis-deleted (#3121). [Rupert Swarbrick]
-* Define VM_TRACE_VCD when tracing in VCD format. [Geza Lore, Shunyao CAD]
 * Support non-ANSI interface port declarations (#3439). [Geza Lore, Shunyao CAD]
+* Support concat assignment to unpacked array (#3446).
 * Improve conditional merging optimization (#3125). [Geza Lore, Shunyao CAD]
+* Define VM_TRACE_VCD when tracing in VCD format. [Geza Lore, Shunyao CAD]
+* Add assert when VerilatedContext is mis-deleted (#3121). [Rupert Swarbrick]
 * Fix hang with large case statement optimization (#3405). [Mike Urbach]
 * Fix 'with' operator with type casting (#3387). [xiak95]
 * Fix incorrect conditional merging (#3409). [Raynard Qiao]
diff --git a/src/V3AstNodes.h b/src/V3AstNodes.h
index 404eff633..118666c0b 100644
--- a/src/V3AstNodes.h
+++ b/src/V3AstNodes.h
@@ -8538,6 +8538,7 @@ public:
     AstNodeDType* childDTypep() const { return VN_AS(op1p(), NodeDType); }
     void childDTypep(AstNodeDType* nodep) { setOp1p(nodep); }
     AstNode* itemsp() const { return op2p(); }  // op2 = AstPatReplicate, AstPatMember, etc
+    void addItemsp(AstNode* nodep) { addOp2p(nodep); }
 };
 class AstPatMember final : public AstNodeMath {
     // Verilog '{a} or '{a{b}}
diff --git a/src/V3Width.cpp b/src/V3Width.cpp
index 69cee08f6..15bdf6f33 100644
--- a/src/V3Width.cpp
+++ b/src/V3Width.cpp
@@ -504,6 +504,7 @@ private:
         //   width: LHS + RHS
         AstNodeDType* const vdtypep = m_vup->dtypeNullSkipRefp();
         userIterate(vdtypep, WidthVP(SELF, BOTH).p());
+        // Conversions
         if (VN_IS(vdtypep, QueueDType)) {
             // Queue "element 0" is lhsp, so we need to swap arguments
             auto* const newp = new AstConsQueue(nodep->fileline(), nodep->rhsp()->unlinkFrBack(),
@@ -521,6 +522,16 @@ private:
             userIterateChildren(newp, m_vup);
             return;
         }
+        if (VN_IS(vdtypep, UnpackArrayDType)) {
+            auto* const newp = new AstPattern{nodep->fileline(), nullptr};
+            patConcatConvertRecurse(newp, nodep);
+            nodep->replaceWith(newp);
+            VL_DO_DANGLING(pushDeletep(nodep), nodep);
+            userIterate(newp, m_vup);
+            return;
+        }
+
+        // Concat handling
         if (m_vup->prelim()) {
             if (VN_IS(vdtypep, AssocArrayDType)  //
                 || VN_IS(vdtypep, DynArrayDType)  //
@@ -662,7 +673,8 @@ private:
             }
 
             AstNodeDType* const vdtypep = m_vup->dtypeNullSkipRefp();
-            if (VN_IS(vdtypep, QueueDType) || VN_IS(vdtypep, DynArrayDType)) {
+            if (VN_IS(vdtypep, QueueDType) || VN_IS(vdtypep, DynArrayDType)
+                || VN_IS(vdtypep, UnpackArrayDType)) {
                 if (times != 1)
                     nodep->v3warn(E_UNSUPPORTED, "Unsupported: Non-1 replication to form "
                                                      << vdtypep->prettyDTypeNameQ()
@@ -674,7 +686,7 @@ private:
                 VL_DO_DANGLING(pushDeletep(nodep), nodep);
                 return;
             }
-            if (VN_IS(vdtypep, AssocArrayDType) || VN_IS(vdtypep, UnpackArrayDType)) {
+            if (VN_IS(vdtypep, AssocArrayDType)) {
                 nodep->v3warn(E_UNSUPPORTED, "Unsupported: Replication to form "
                                                  << vdtypep->prettyDTypeNameQ() << " data type");
             }
@@ -6231,6 +6243,21 @@ private:
         return patmap;
     }
 
+    void patConcatConvertRecurse(AstPattern* patternp, AstConcat* nodep) {  // Append nodep's operands to patternp as items, LHS first
+        if (AstConcat* lhsp = VN_CAST(nodep->lhsp(), Concat)) {  // LHS is itself a concat
+            patConcatConvertRecurse(patternp, lhsp);  // Recurse so its operands are added instead of the concat node
+        } else {  // LHS is not a concat: unlink it and wrap it in a new AstPatMember
+            patternp->addItemsp(new AstPatMember{nodep->lhsp()->fileline(),
+                                                 nodep->lhsp()->unlinkFrBack(), nullptr, nullptr});
+        }
+        if (AstConcat* rhsp = VN_CAST(nodep->rhsp(), Concat)) {  // Same handling for the RHS, after the LHS
+            patConcatConvertRecurse(patternp, rhsp);
+        } else {  // RHS is not a concat: unlink it and wrap it in a new AstPatMember
+            patternp->addItemsp(new AstPatMember{nodep->rhsp()->fileline(),
+                                                 nodep->rhsp()->unlinkFrBack(), nullptr, nullptr});
+        }
+    }
+
     void makeOpenArrayShell(AstNodeFTaskRef* nodep) {
         UINFO(4, "Replicate openarray function " << nodep->taskp() << endl);
         AstNodeFTask* const oldTaskp = nodep->taskp();
diff --git a/test_regress/t/t_concat_unpack.pl b/test_regress/t/t_concat_unpack.pl
new file mode 100755
index 000000000..1aa73f80a
--- /dev/null
+++ b/test_regress/t/t_concat_unpack.pl
@@ -0,0 +1,21 @@
+#!/usr/bin/env perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2022 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+scenarios(simulator => 1);
+
+compile(
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+ok(1);
+1;
diff --git a/test_regress/t/t_concat_unpack.v b/test_regress/t/t_concat_unpack.v
new file mode 100755
index 000000000..8d3f4bac2
--- /dev/null
+++ b/test_regress/t/t_concat_unpack.v
@@ -0,0 +1,36 @@
+// DESCRIPTION: Verilator: Verilog Test module
+//
+// This file ONLY is placed under the Creative Commons Public Domain, for
+// any use, without warranty, 2022 by Wilson Snyder.
+// SPDX-License-Identifier: CC0-1.0
+
+module t(/*AUTOARG*/
+   // Inputs
+   clk
+   );
+   input clk;
+
+   wire [31:0] arr [0:7];  // Unpacked array of eight 32-bit words
+   assign arr[0:7] = {  // Concatenation assigned to the unpacked array slice; eight 32-bit inner concats
+                      {16'hffff, 16'h0000},  // Compared against arr[0] below
+                      {16'h0000, 16'h0000},
+                      {16'h0a0a, 16'h0000},
+                      {16'ha0a0, 16'h0000},
+                      {16'hffff, 16'h0000},
+                      {16'h0000, 16'h0000},
+                      {16'h0a0a, 16'h0000},
+                      {16'ha0a0, 16'h0000}  // Compared against arr[7] below
+                      };
+
+   int cyc = 0;  // Cycle counter, incremented on every posedge clk
+
+   always @(posedge clk) begin
+      cyc <= cyc + 1;
+      if (cyc == 9) begin  // Run the checks once, then finish
+         if (arr[0] !== 32'hffff0000) $stop;  // First listed element must be in arr[0]
+         if (arr[7] !== 32'ha0a00000) $stop;  // Last listed element must be in arr[7]
+         $write("*-* All Finished *-*\n");
+         $finish;
+      end
+   end
+endmodule
diff --git a/test_regress/t/t_unpacked_concat_bad.out b/test_regress/t/t_unpacked_concat_bad.out
index 4c89adfe6..1482e7507 100644
--- a/test_regress/t/t_unpacked_concat_bad.out
+++ b/test_regress/t/t_unpacked_concat_bad.out
@@ -1,23 +1,6 @@
-%Error-UNSUPPORTED: t/t_unpacked_concat_bad.v:17:46: Unsupported: Replication to form 'bit[31:0]$[1:0]' data type
+%Error-UNSUPPORTED: t/t_unpacked_concat_bad.v:17:46: Unsupported: Non-1 replication to form 'bit[31:0]$[1:0]' data type
                                                    : ... In instance t
    17 |    localparam bit_int_t count_bits [1:0] = {2{$bits(count_t)}};
       |                                              ^
                     ... For error description see https://verilator.org/warn/UNSUPPORTED?v=latest
-%Warning-WIDTHCONCAT: t/t_unpacked_concat_bad.v:17:47: Unsized numbers/parameters not allowed in replications.
-                                                     : ... In instance t
-   17 |    localparam bit_int_t count_bits [1:0] = {2{$bits(count_t)}};
-      |                                               ^~~~~
-                      ... Use "/* verilator lint_off WIDTHCONCAT */" and lint_on around source to disable this message.
-%Error-UNSUPPORTED: t/t_unpacked_concat_bad.v:18:45: Unsupported: Replication to form 'bit[31:0]$[1:0]' data type
-                                                   : ... In instance t
-   18 |    localparam bit_int_t count_bitsc [1:0] = {$bits(count_t), $bits(count_t)};
-      |                                             ^
-%Warning-WIDTHCONCAT: t/t_unpacked_concat_bad.v:18:46: Unsized numbers/parameters not allowed in concatenations.
-                                                     : ... In instance t
-   18 |    localparam bit_int_t count_bitsc [1:0] = {$bits(count_t), $bits(count_t)};
-      |                                              ^~~~~
-%Warning-WIDTHCONCAT: t/t_unpacked_concat_bad.v:18:60: Unsized numbers/parameters not allowed in replications.
-                                                     : ... In instance t
-   18 |    localparam bit_int_t count_bitsc [1:0] = {$bits(count_t), $bits(count_t)};
-      |                                                            ^
 %Error: Exiting due to

From 67f7432dd7323374023306e71cb11c25fd70e55f Mon Sep 17 00:00:00 2001
From: Wilson Snyder <wsnyder@wsnyder.org>
Date: Sat, 4 Jun 2022 08:37:42 -0400
Subject: [PATCH 19/19] Commentary (#3436).

---
 bin/verilator                   |  2 +-
 docs/guide/deprecations.rst     |  6 ++---
 docs/guide/exe_verilator.rst    | 42 ++++++++++++++++-----------------
 test_regress/t/t_case_write1.pl |  2 +-
 test_regress/t/t_case_write2.pl |  2 +-
 5 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/bin/verilator b/bin/verilator
index f04e2a593..a50e353bb 100755
--- a/bin/verilator
+++ b/bin/verilator
@@ -319,7 +319,7 @@ detailed descriptions of these arguments.
      -f <file>                  Parse arguments from a file
      -FI <file>                 Force include of a file
     --flatten                   Force inlining of all modules, tasks and functions
-    --fno-<optimization>        Disable internal optimization stage
+     -fno-<optimization>        Disable internal optimization stage
      -G<name>=<value>           Overwrite top-level parameter
     --gdb                       Run Verilator under GDB interactively
     --gdbbt                     Run Verilator under GDB for backtrace
diff --git a/docs/guide/deprecations.rst b/docs/guide/deprecations.rst
index 4c2d96592..8c0038453 100644
--- a/docs/guide/deprecations.rst
+++ b/docs/guide/deprecations.rst
@@ -20,9 +20,9 @@ Option `--cdc`
   The experimental `--cdc` option is believed to be generally unused and is
   planned for removal no sooner than January 2023.
 
-Option `--O<letter>`
-  The debug `--O<letter>` options have been replaced with
-  `--fno-<optimization>` debug options to match GCC. The old options are
+Option `-O<letter>`
+  The debug `-O<letter>` options have been replaced with
+  `-fno-<optimization>` debug options to match GCC. The old options are
   planned for removal no sooner than June 2023.
 
 Option `--prof-threads`
diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst
index fda5aedb3..af65fe3ba 100644
--- a/docs/guide/exe_verilator.rst
+++ b/docs/guide/exe_verilator.rst
@@ -431,47 +431,47 @@ Summary:
    flattening large designs may require significant CPU time, memory and
    storage.
 
-.. option:: --fno-acyc-simp
+.. option:: -fno-acyc-simp
 
-.. option:: --fno-assemble
+.. option:: -fno-assemble
 
-.. option:: --fno-case
+.. option:: -fno-case
 
-.. option:: --fno-combine
+.. option:: -fno-combine
 
-.. option:: --fno-const
+.. option:: -fno-const
 
-.. option:: --fno-const-bit-op-tree
+.. option:: -fno-const-bit-op-tree
 
-.. option:: --fno-dedup
+.. option:: -fno-dedup
 
-.. option:: --fno-expand
+.. option:: -fno-expand
 
-.. option:: --fno-gate
+.. option:: -fno-gate
 
-.. option:: --fno-inline
+.. option:: -fno-inline
 
-.. option:: --fno-life
+.. option:: -fno-life
 
-.. option:: --fno-life-post
+.. option:: -fno-life-post
 
-.. option:: --fno-localize
+.. option:: -fno-localize
 
-.. option:: --fno-merge-cond
+.. option:: -fno-merge-cond
 
-.. option:: --fno-merge-const-pool
+.. option:: -fno-merge-const-pool
 
-.. option:: --fno-reloop
+.. option:: -fno-reloop
 
-.. option:: --fno-reorder
+.. option:: -fno-reorder
 
-.. option:: --fno-split
+.. option:: -fno-split
 
-.. option:: --fno-subst
+.. option:: -fno-subst
 
-.. option:: --fno-subst-const
+.. option:: -fno-subst-const
 
-.. option:: --fno-table
+.. option:: -fno-table
 
    Rarely needed. Disables one of the internal optimization steps. These
    are typically used only when recommended by a maintainer to help debug
diff --git a/test_regress/t/t_case_write1.pl b/test_regress/t/t_case_write1.pl
index 4fa36576d..33e2bb517 100755
--- a/test_regress/t/t_case_write1.pl
+++ b/test_regress/t/t_case_write1.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(simulator => 1);
 
 compile(
-    verilator_flags2 => ["--stats --O3 -x-assign fast"],
+    verilator_flags2 => ["--stats -O3 -x-assign fast"],
     );
 
 execute(
diff --git a/test_regress/t/t_case_write2.pl b/test_regress/t/t_case_write2.pl
index 4fa36576d..33e2bb517 100755
--- a/test_regress/t/t_case_write2.pl
+++ b/test_regress/t/t_case_write2.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(simulator => 1);
 
 compile(
-    verilator_flags2 => ["--stats --O3 -x-assign fast"],
+    verilator_flags2 => ["--stats -O3 -x-assign fast"],
     );
 
 execute(