diff --git a/src/V3Partition.cpp b/src/V3Partition.cpp index 16e7d612c..b1e2edb21 100644 --- a/src/V3Partition.cpp +++ b/src/V3Partition.cpp @@ -1256,12 +1256,18 @@ private: PartPropagateCp m_forwardPropagator{m_slowAsserts}; // Forward propagator PartPropagateCp m_reversePropagator{m_slowAsserts}; // Reverse propagator + LogicMTask* const m_entryMTaskp; // Singular source vertex of the dependency graph + LogicMTask* const m_exitMTaskp; // Singular sink vertex of the dependency graph + public: // CONSTRUCTORS - PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, bool slowAsserts) + PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, LogicMTask* entryMTaskp, + LogicMTask* exitMTaskp, bool slowAsserts) : m_mtasksp{mtasksp} , m_scoreLimit{scoreLimit} - , m_slowAsserts{slowAsserts} {} + , m_slowAsserts{slowAsserts} + , m_entryMTaskp{entryMTaskp} + , m_exitMTaskp{exitMTaskp} {} // METHODS void go() { @@ -1377,6 +1383,16 @@ public: continue; } + // Avoid merging the entry/exit nodes. This would create serialization, by forcing the + // merged MTask to run before/after everything else. Empirically this helps + // performance in a modest way by allowing other MTasks to start earlier. + if (MTaskEdge* const edgep = mergeCanp->toMTaskEdge()) { + if (edgep->fromp() == m_entryMTaskp || edgep->top() == m_exitMTaskp) { + m_sb.remove(mergeCanp); + continue; + } + } + // Avoid merging any edge that would create a cycle. // // For example suppose we begin with vertices A, B, C and edges @@ -1742,7 +1758,7 @@ private: // slowAsserts. 
PartContraction ec{&mtasks, // Any CP limit >chain_len should work: - chain_len * 2, false /* slowAsserts */}; + chain_len * 2, nullptr, nullptr, false /* slowAsserts */}; ec.go(); PartParallelismEst check{&mtasks}; @@ -1796,7 +1812,7 @@ private: } partInitCriticalPaths(&mtasks); - PartContraction{&mtasks, 20, true}.go(); + PartContraction{&mtasks, 20, nullptr, nullptr, true}.go(); PartParallelismEst check{&mtasks}; check.traverse(); @@ -2644,7 +2660,7 @@ uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) { // Artificial single entry point vertex in the MTask graph to allow sibling merges. // This is required as otherwise disjoint sub-graphs could not be merged, but the // coarsening algorithm assumes that the graph is connected. - LogicMTask* const entryMTask = new LogicMTask{mtasksp, nullptr}; + m_entryMTaskp = new LogicMTask{mtasksp, nullptr}; // The V3InstrCount within LogicMTask will set user5 on each AST // node, to assert that we never count any node twice. @@ -2665,7 +2681,7 @@ uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) { // Artificial single exit point vertex in the MTask graph to allow sibling merges. // this enables merging MTasks with no downstream dependents if that is the ideal merge. - LogicMTask* const exitMTask = new LogicMTask{mtasksp, nullptr}; + m_exitMTaskp = new LogicMTask{mtasksp, nullptr}; // Create the mtask->mtask dependency edges based on the dependencies between MTaskMoveVertex // vertices. 
@@ -2674,7 +2690,7 @@ uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) { LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp); // Entry and exit vertices handled separately - if (VL_UNLIKELY((mtaskp == entryMTask) || (mtaskp == exitMTask))) continue; + if (VL_UNLIKELY((mtaskp == m_entryMTaskp) || (mtaskp == m_exitMTaskp))) continue; // At this point, there should only be one MTaskMoveVertex per LogicMTask UASSERT_OBJ(mtaskp->vertexListp()->size() == 1, mtaskp, "Multiple MTaskMoveVertex"); @@ -2714,11 +2730,11 @@ uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) { nextp = vtxp->verticesNextp(); LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp); - if (VL_UNLIKELY((mtaskp == entryMTask) || (mtaskp == exitMTask))) continue; + if (VL_UNLIKELY((mtaskp == m_entryMTaskp) || (mtaskp == m_exitMTaskp))) continue; // Add the entry/exit edges - if (mtaskp->inEmpty()) new MTaskEdge{mtasksp, entryMTask, mtaskp, 1}; - if (mtaskp->outEmpty()) new MTaskEdge{mtasksp, mtaskp, exitMTask, 1}; + if (mtaskp->inEmpty()) new MTaskEdge{mtasksp, m_entryMTaskp, mtaskp, 1}; + if (mtaskp->outEmpty()) new MTaskEdge{mtasksp, mtaskp, m_exitMTaskp, 1}; } return totalGraphCost; @@ -2786,7 +2802,7 @@ void V3Partition::go(V3Graph* mtasksp) { // Some tests disable this, hence the test on threadsCoarsen(). // Coarsening is always enabled in production. if (v3Global.opt.threadsCoarsen()) { - PartContraction{mtasksp, cpLimit, + PartContraction{mtasksp, cpLimit, m_entryMTaskp, m_exitMTaskp, // --debugPartition is used by tests // to enable slow assertions. 
v3Global.opt.debugPartition()} diff --git a/src/V3Partition.h b/src/V3Partition.h index caac97371..5aba6b009 100644 --- a/src/V3Partition.h +++ b/src/V3Partition.h @@ -41,6 +41,10 @@ class V3Partition final { // MEMBERS const OrderGraph* const m_orderGraphp; // The OrderGraph const V3Graph* const m_fineDepsGraphp; // Fine-grained dependency graph + + LogicMTask* m_entryMTaskp = nullptr; // Singular source vertex of the dependency graph + LogicMTask* m_exitMTaskp = nullptr; // Singular sink vertex of the dependency graph + public: // CONSTRUCTORS explicit V3Partition(const OrderGraph* orderGraphp, const V3Graph* fineDepsGraphp)