diff --git a/src/V3Partition.cpp b/src/V3Partition.cpp
index 16e7d612c..b1e2edb21 100644
--- a/src/V3Partition.cpp
+++ b/src/V3Partition.cpp
@@ -1256,12 +1256,18 @@ private:
     PartPropagateCp<GraphWay::FORWARD> m_forwardPropagator{m_slowAsserts};  // Forward propagator
     PartPropagateCp<GraphWay::REVERSE> m_reversePropagator{m_slowAsserts};  // Reverse propagator
 
+    LogicMTask* const m_entryMTaskp;  // Sole source vertex of the dependency graph, or nullptr
+    LogicMTask* const m_exitMTaskp;  // Sole sink vertex of the dependency graph, or nullptr
+
 public:
     // CONSTRUCTORS
-    PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, bool slowAsserts)
+    PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, LogicMTask* entryMTaskp,
+                    LogicMTask* exitMTaskp, bool slowAsserts)
         : m_mtasksp{mtasksp}
         , m_scoreLimit{scoreLimit}
-        , m_slowAsserts{slowAsserts} {}
+        , m_slowAsserts{slowAsserts}
+        , m_entryMTaskp{entryMTaskp}
+        , m_exitMTaskp{exitMTaskp} {}
 
     // METHODS
     void go() {
@@ -1377,6 +1383,16 @@ public:
                 continue;
             }
 
+            // Avoid merging the entry/exit nodes. Such a merge would create serialization, by
+            // forcing the merged MTask to run before/after everything else. Empirically, skipping
+            // these merges helps performance modestly by allowing other MTasks to start earlier.
+            if (MTaskEdge* const edgep = mergeCanp->toMTaskEdge()) {
+                if (edgep->fromp() == m_entryMTaskp || edgep->top() == m_exitMTaskp) {
+                    m_sb.remove(mergeCanp);
+                    continue;
+                }
+            }
+
             // Avoid merging any edge that would create a cycle.
             //
             // For example suppose we begin with vertices A, B, C and edges
@@ -1742,7 +1758,7 @@ private:
         // slowAsserts.
         PartContraction ec{&mtasks,
                            // Any CP limit >chain_len should work:
-                           chain_len * 2, false /* slowAsserts */};
+                           chain_len * 2, nullptr, nullptr, false /* slowAsserts */};
         ec.go();
 
         PartParallelismEst check{&mtasks};
@@ -1796,7 +1812,7 @@ private:
         }
 
         partInitCriticalPaths(&mtasks);
-        PartContraction{&mtasks, 20, true}.go();
+        PartContraction{&mtasks, 20, nullptr, nullptr, true}.go();
 
         PartParallelismEst check{&mtasks};
         check.traverse();
@@ -2644,7 +2660,7 @@ uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) {
     // Artificial single entry point vertex in the MTask graph to allow sibling merges.
     // This is required as otherwise disjoint sub-graphs could not be merged, but the
     // coarsening algorithm assumes that the graph is connected.
-    LogicMTask* const entryMTask = new LogicMTask{mtasksp, nullptr};
+    m_entryMTaskp = new LogicMTask{mtasksp, nullptr};
 
     // The V3InstrCount within LogicMTask will set user5 on each AST
     // node, to assert that we never count any node twice.
@@ -2665,7 +2681,7 @@ uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) {
 
     // Artificial single exit point vertex in the MTask graph to allow sibling merges.
     // this enables merging MTasks with no downstream dependents if that is the ideal merge.
-    LogicMTask* const exitMTask = new LogicMTask{mtasksp, nullptr};
+    m_exitMTaskp = new LogicMTask{mtasksp, nullptr};
 
     // Create the mtask->mtask dependency edges based on the dependencies between MTaskMoveVertex
     // vertices.
@@ -2674,7 +2690,7 @@ uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) {
         LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp);
 
         // Entry and exit vertices handled separately
-        if (VL_UNLIKELY((mtaskp == entryMTask) || (mtaskp == exitMTask))) continue;
+        if (VL_UNLIKELY((mtaskp == m_entryMTaskp) || (mtaskp == m_exitMTaskp))) continue;
 
         // At this point, there should only be one MTaskMoveVertex per LogicMTask
         UASSERT_OBJ(mtaskp->vertexListp()->size() == 1, mtaskp, "Multiple MTaskMoveVertex");
@@ -2714,11 +2730,11 @@ uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) {
         nextp = vtxp->verticesNextp();
         LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp);
 
-        if (VL_UNLIKELY((mtaskp == entryMTask) || (mtaskp == exitMTask))) continue;
+        if (VL_UNLIKELY((mtaskp == m_entryMTaskp) || (mtaskp == m_exitMTaskp))) continue;
 
         // Add the entry/exit edges
-        if (mtaskp->inEmpty()) new MTaskEdge{mtasksp, entryMTask, mtaskp, 1};
-        if (mtaskp->outEmpty()) new MTaskEdge{mtasksp, mtaskp, exitMTask, 1};
+        if (mtaskp->inEmpty()) new MTaskEdge{mtasksp, m_entryMTaskp, mtaskp, 1};
+        if (mtaskp->outEmpty()) new MTaskEdge{mtasksp, mtaskp, m_exitMTaskp, 1};
     }
 
     return totalGraphCost;
@@ -2786,7 +2802,7 @@ void V3Partition::go(V3Graph* mtasksp) {
     // Some tests disable this, hence the test on threadsCoarsen().
     // Coarsening is always enabled in production.
     if (v3Global.opt.threadsCoarsen()) {
-        PartContraction{mtasksp, cpLimit,
+        PartContraction{mtasksp, cpLimit, m_entryMTaskp, m_exitMTaskp,
                         // --debugPartition is used by tests
                         // to enable slow assertions.
                         v3Global.opt.debugPartition()}
diff --git a/src/V3Partition.h b/src/V3Partition.h
index caac97371..5aba6b009 100644
--- a/src/V3Partition.h
+++ b/src/V3Partition.h
@@ -41,6 +41,10 @@ class V3Partition final {
     // MEMBERS
     const OrderGraph* const m_orderGraphp;  // The OrderGraph
     const V3Graph* const m_fineDepsGraphp;  // Fine-grained dependency graph
+
+    LogicMTask* m_entryMTaskp = nullptr;  // Singular source vertex of the dependency graph
+    LogicMTask* m_exitMTaskp = nullptr;  // Singular sink vertex of the dependency graph
+
 public:
     // CONSTRUCTORS
     explicit V3Partition(const OrderGraph* orderGraphp, const V3Graph* fineDepsGraphp)