From 875361d7cee7a102b1a79a91622c14d3ec45d80a Mon Sep 17 00:00:00 2001
From: Geza Lore <gezalore@gmail.com>
Date: Thu, 1 Sep 2022 17:29:40 +0200
Subject: [PATCH] V3Partition: Reduce working set size of PartContraction
 (#3587)

This yields an additional 25% speedup of MT scheduling.
---
 src/V3Order.cpp     | 55 ++++++++++++++++---------------
 src/V3Partition.cpp | 80 +++++++++++++++++++++++++++++++++++++--------
 2 files changed, 95 insertions(+), 40 deletions(-)
diff --git a/src/V3Order.cpp b/src/V3Order.cpp
index d2bc97118..678dafa78 100644
--- a/src/V3Order.cpp
+++ b/src/V3Order.cpp
@@ -1334,37 +1334,38 @@ void OrderProcess::processMTasks() {
     const V3GraphVertex* moveVxp;
     while ((moveVxp = emit_logic.nextp())) {
         const MTaskMoveVertex* const movep = dynamic_cast<const MTaskMoveVertex*>(moveVxp);
+        // Only care about logic vertices
+        if (!movep->logicp()) continue;
+
         const unsigned mtaskId = movep->color();
         UASSERT(mtaskId > 0, "Every MTaskMoveVertex should have an mtask assignment >0");
-        if (movep->logicp()) {
-            // Add this logic to the per-mtask order
-            mtaskStates[mtaskId].m_logics.push_back(movep->logicp());
 
-            // Since we happen to be iterating over every logic node,
-            // take this opportunity to annotate each AstVar with the id's
-            // of mtasks that consume it and produce it. We'll use this
-            // information in V3EmitC when we lay out var's in memory.
-            const OrderLogicVertex* const logicp = movep->logicp();
-            for (const V3GraphEdge* edgep = logicp->inBeginp(); edgep; edgep = edgep->inNextp()) {
-                const OrderVarVertex* const pre_varp
-                    = dynamic_cast<const OrderVarVertex*>(edgep->fromp());
-                if (!pre_varp) continue;
-                AstVar* const varp = pre_varp->vscp()->varp();
-                // varp depends on logicp, so logicp produces varp,
-                // and vice-versa below
-                varp->addProducingMTaskId(mtaskId);
-            }
-            for (const V3GraphEdge* edgep = logicp->outBeginp(); edgep;
-                 edgep = edgep->outNextp()) {
-                const OrderVarVertex* const post_varp
-                    = dynamic_cast<const OrderVarVertex*>(edgep->top());
-                if (!post_varp) continue;
-                AstVar* const varp = post_varp->vscp()->varp();
-                varp->addConsumingMTaskId(mtaskId);
-            }
-            // TODO? We ignore IO vars here, so those will have empty mtask
-            // signatures. But we could also give those mtask signatures.
+        // Add this logic to the per-mtask order
+        mtaskStates[mtaskId].m_logics.push_back(movep->logicp());
+
+        // Since we happen to be iterating over every logic node,
+        // take this opportunity to annotate each AstVar with the id's
+        // of mtasks that consume it and produce it. We'll use this
+        // information in V3EmitC when we lay out var's in memory.
+        const OrderLogicVertex* const logicp = movep->logicp();
+        for (const V3GraphEdge* edgep = logicp->inBeginp(); edgep; edgep = edgep->inNextp()) {
+            const OrderVarVertex* const pre_varp
+                = dynamic_cast<const OrderVarVertex*>(edgep->fromp());
+            if (!pre_varp) continue;
+            AstVar* const varp = pre_varp->vscp()->varp();
+            // varp depends on logicp, so logicp produces varp,
+            // and vice-versa below
+            varp->addProducingMTaskId(mtaskId);
         }
+        for (const V3GraphEdge* edgep = logicp->outBeginp(); edgep; edgep = edgep->outNextp()) {
+            const OrderVarVertex* const post_varp
+                = dynamic_cast<const OrderVarVertex*>(edgep->top());
+            if (!post_varp) continue;
+            AstVar* const varp = post_varp->vscp()->varp();
+            varp->addConsumingMTaskId(mtaskId);
+        }
+        // TODO? We ignore IO vars here, so those will have empty mtask
+        // signatures. But we could also give those mtask signatures.
     }
 
     // Create the AstExecGraph node which represents the execution
diff --git a/src/V3Partition.cpp b/src/V3Partition.cpp
index 479a91fd6..e2d70360e 100644
--- a/src/V3Partition.cpp
+++ b/src/V3Partition.cpp
@@ -2611,6 +2611,42 @@ void V3Partition::hashGraphDebug(const V3Graph* graphp, const char* debugName) {
     UINFO(0, "Hash of shape (not contents) of " << debugName << " = " << cvtToStr(hash) << endl);
 }
 
+// Predicate function to determine what MTaskMoveVertex to bypass when constructing the MTask
+// graph. The fine-grained dependency graph of MTaskMoveVertex vertices is a bipartite graph of:
+// - 1. MTaskMoveVertex instances containing logic via OrderLogicVertex
+//      (MTaskMoveVertex::logicp() != nullptr)
+// - 2. MTaskMoveVertex instances containing an (OrderVarVertex, domain) pair
+// Our goal is to order the logic vertices. The second type of variable/domain vertices only carry
+// dependencies and are eventually discarded. In order to reduce the working set size of
+// PartContraction, we 'bypass' and do not create LogicMTask vertices for variable vertices, and
+// instead add the transitive dependencies directly, but only if adding the transitive edges
+// directly does not require more dependency edges than keeping the intermediate vertex. That is,
+// we bypass a variable vertex if fanIn * fanOut <= fanIn + fanOut. This holds exactly when fanIn
+// or fanOut is at most 1, or when both are 2. This can cause significant reduction in working
+// set size.
+static bool bypassOk(MTaskMoveVertex* mvtxp) {
+    // Need to keep all logic vertices
+    if (mvtxp->logicp()) return false;
+    // Count fan-in, saturating at 3 (we only need to tell apart 0, 1, 2 and 'more than 2')
+    unsigned fanIn = 0;
+    for (V3GraphEdge* edgep = mvtxp->inBeginp(); edgep; edgep = edgep->inNextp()) {
+        if (++fanIn == 3) break;
+    }
+    UDEBUGONLY(UASSERT_OBJ(fanIn <= 3, mvtxp, "Should have stopped counting fanIn"););
+    // If fan-in is no more than one, bypass regardless of fan-out
+    if (fanIn <= 1) return true;
+    // Count fan-out, saturating at 3 in the same way
+    unsigned fanOut = 0;
+    for (V3GraphEdge* edgep = mvtxp->outBeginp(); edgep; edgep = edgep->outNextp()) {
+        if (++fanOut == 3) break;
+    }
+    UDEBUGONLY(UASSERT_OBJ(fanOut <= 3, mvtxp, "Should have stopped counting fanOut"););
+    // If fan-out is no more than one, bypass regardless of fan-in
+    if (fanOut <= 1) return true;
+    // They can only be (2, 2), (2, 3), (3, 2), (3, 3) at this point, bypass if (2, 2)
+    return fanIn + fanOut == 4;
+}
+
 uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) {
     uint32_t totalGraphCost = 0;
 
@@ -2627,9 +2663,13 @@ uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) {
     for (V3GraphVertex *vtxp = m_fineDepsGraphp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
         nextp = vtxp->verticesNextp();
         MTaskMoveVertex* const mVtxp = static_cast<MTaskMoveVertex*>(vtxp);
-        LogicMTask* const mtaskp = new LogicMTask{mtasksp, mVtxp};
-        mVtxp->userp(mtaskp);
-        totalGraphCost += mtaskp->cost();
+        if (bypassOk(mVtxp)) {
+            mVtxp->userp(nullptr);  // Set to nullptr to mark as bypassed
+        } else {
+            LogicMTask* const mtaskp = new LogicMTask{mtasksp, mVtxp};
+            mVtxp->userp(mtaskp);
+            totalGraphCost += mtaskp->cost();
+        }
     }
 
     // Artificial single exit point vertex in the MTask graph to allow sibling merges.
@@ -2647,20 +2687,34 @@ uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) {
 
         // At this point, there should only be one MTaskMoveVertex per LogicMTask
         UASSERT_OBJ(mtaskp->vertexListp()->size() == 1, mtaskp, "Multiple MTaskMoveVertex");
-
         MTaskMoveVertex* const mvtxp = mtaskp->vertexListp()->front();
-        for (V3GraphEdge* outp = mvtxp->outBeginp(); outp; outp = outp->outNextp()) {
-            UASSERT(outp->weight() > 0, "Dependency with 0 weight in Move graph");
+        UASSERT_OBJ(mvtxp->userp(), mtaskp, "Bypassed MTaskMoveVertex should not have MTask");
 
-            // Grab the opposite end MTask.
-            LogicMTask* const otherp = static_cast<LogicMTask*>(outp->top()->userp());
+        // Function to add an edge from 'mtaskp' to a dependent
+        const auto addEdge = [mtasksp, mtaskp](LogicMTask* otherp) {
             UASSERT_OBJ(otherp != mtaskp, mtaskp, "Would create a cycle edge");
-
-            // Don't create redundant edges.
-            if (mtaskp->hasRelativeMTask(otherp)) continue;
-
-            // Add the MTask->MTask dependency edge
+            if (mtaskp->hasRelativeMTask(otherp)) return;  // Don't create redundant edges.
             new MTaskEdge{mtasksp, mtaskp, otherp, 1};
+        };
+
+        // Iterate downstream direct dependents
+        for (V3GraphEdge *dEdgep = mvtxp->outBeginp(), *dNextp; dEdgep; dEdgep = dNextp) {
+            dNextp = dEdgep->outNextp();
+            V3GraphVertex* const top = dEdgep->top();
+            if (LogicMTask* const otherp = static_cast<LogicMTask*>(top->userp())) {
+                // The opposite end of the edge is not a bypassed vertex, add as direct dependent
+                addEdge(otherp);
+            } else {
+                // The opposite end of the edge is a bypassed vertex, add transitive dependents
+                for (V3GraphEdge *tEdgep = top->outBeginp(), *tNextp; tEdgep; tEdgep = tNextp) {
+                    tNextp = tEdgep->outNextp();
+                    LogicMTask* const transp = static_cast<LogicMTask*>(tEdgep->top()->userp());
+                    // The Move graph is bipartite (logic <-> var), and logic is never bypassed,
+                    // hence 'transp' must be non nullptr.
+                    UASSERT_OBJ(transp, mvtxp, "This cannot be a bypassed vertex");
+                    addEdge(transp);
+                }
+            }
         }
     }