From 2247e1e345291cd9baff94f47c8337e9ca5fb6b3 Mon Sep 17 00:00:00 2001 From: Geza Lore Date: Sun, 10 Mar 2024 18:15:45 +0000 Subject: [PATCH] Cleanup/simplify V3OrderParallel (#4959) No functional change. --- src/V3AstNodeOther.h | 3 +- src/V3OrderInternal.h | 4 +- src/V3OrderParallel.cpp | 1319 ++++++++++++++++++--------------------- src/V3PartitionGraph.h | 41 +- 4 files changed, 631 insertions(+), 736 deletions(-) diff --git a/src/V3AstNodeOther.h b/src/V3AstNodeOther.h index 3b621ff42..4a745a0c8 100644 --- a/src/V3AstNodeOther.h +++ b/src/V3AstNodeOther.h @@ -2100,8 +2100,7 @@ public: m_name = name; } static AstVar* scVarRecurse(AstNode* nodep); - void addProducingMTaskId(int id) { m_mtaskIds.insert(id); } - void addConsumingMTaskId(int id) { m_mtaskIds.insert(id); } + void addMTaskId(int id) { m_mtaskIds.insert(id); } const MTaskIdSet& mtaskIds() const { return m_mtaskIds; } void pinNum(int id) { m_pinNum = id; } int pinNum() const { return m_pinNum; } diff --git a/src/V3OrderInternal.h b/src/V3OrderInternal.h index 2922b7085..a35a4199e 100644 --- a/src/V3OrderInternal.h +++ b/src/V3OrderInternal.h @@ -52,12 +52,12 @@ void processDomains(AstNetlist* netlistp, // const TrigToSenMap& trigToSen, // const ExternalDomainsProvider& externalDomains); -std::vector createSerial(const OrderGraph& graph, // +std::vector createSerial(const OrderGraph& orderGraph, // const std::string& tag, // const TrigToSenMap& trigToSenMap, // bool slow); -AstExecGraph* createParallel(const OrderGraph& graph, // +AstExecGraph* createParallel(const OrderGraph& orderGraph, // const std::string& tag, // const TrigToSenMap& trigToSenMap, // bool slow); diff --git a/src/V3OrderParallel.cpp b/src/V3OrderParallel.cpp index 7eff76e0a..0a4e8a7f7 100644 --- a/src/V3OrderParallel.cpp +++ b/src/V3OrderParallel.cpp @@ -45,7 +45,6 @@ VL_DEFINE_DEBUG_FUNCTIONS; -class LogicMTask; class MTaskEdge; class MergeCandidate; class SiblingMC; @@ -53,10 +52,7 @@ class SiblingMC; // Similar to OrderMoveVertex, but modified for threaded code generation. class MTaskMoveVertex final : public V3GraphVertex { VL_RTTI_IMPL(MTaskMoveVertex, V3GraphVertex) - // This could be more compact, since we know m_varp and m_logicp - // cannot both be set. Each MTaskMoveVertex represents a logic node - // or a var node, it can't be both. - OrderLogicVertex* const m_logicp; // Logic represented by this vertex + OrderLogicVertex* const m_logicp; // Logic represented by this vertex, or nullptr if variable const AstSenTree* const m_domainp; public: @@ -71,68 +67,22 @@ public: const AstScope* scopep() const { return m_logicp ? m_logicp->scopep() : nullptr; } const AstSenTree* domainp() const { return m_domainp; } - string dotColor() const override { - if (logicp()) { - return logicp()->dotColor(); - } else { - return "yellow"; - } - } + string dotColor() const override { return logicp() ? logicp()->dotColor() : "yellow"; } string name() const override { - string nm; - if (logicp()) { - nm = logicp()->name(); - nm += (string{"\\nMV:"} + " d=" + cvtToHex(logicp()->domainp()) + " s=" - + cvtToHex(logicp()->scopep()) - // "color()" represents the mtask ID. - + "\\nt=" + cvtToStr(color())); + std::string nm; + if (!logicp()) { + nm = "var"; } else { - nm = "nolog\\nt=" + cvtToStr(color()); + nm = logicp()->name() + "\\n"; + nm += "MV:"; + nm += +" d=" + cvtToHex(logicp()->domainp()); + nm += +" s=" + cvtToHex(logicp()->scopep()); } + nm += "\nt=" + std::to_string(color()); // "color()" represents the mtask ID. 
return nm; } }; -//************************************************************************* -// V3Partition takes the fine-grained logic graph from V3Order and -// collapses it into a coarse-grained graph of AbstractLogicMTask's, each -// of which contains of set of the logic nodes from the fine-grained -// graph. - -class V3Partition final { - // MEMBERS - const OrderGraph* const m_orderGraphp; // The OrderGraph - const V3Graph* const m_fineDepsGraphp; // Fine-grained dependency graph - - LogicMTask* m_entryMTaskp = nullptr; // Singular source vertex of the dependency graph - LogicMTask* m_exitMTaskp = nullptr; // Singular sink vertex of the dependency graph - -public: - // CONSTRUCTORS - explicit V3Partition(const OrderGraph* orderGraphp, const V3Graph* fineDepsGraphp) - : m_orderGraphp{orderGraphp} - , m_fineDepsGraphp{fineDepsGraphp} {} - ~V3Partition() = default; - - // METHODS - - // Fill in the provided empty graph with AbstractLogicMTask's and their - // interdependencies. - void go(V3Graph* mtasksp) VL_MT_DISABLED; - - // Print out a hash of the shape of graphp. Only needed to debug the - // origin of some nondeterminism; otherwise this is pretty useless. - static void hashGraphDebug(const V3Graph* graphp, const char* debugName) VL_MT_DISABLED; - - // Print debug stats about graphp whose nodes must be AbstractMTask's. - static void debugMTaskGraphStats(const V3Graph* graphp, const string& stage) VL_MT_DISABLED; - -private: - uint32_t setupMTaskDeps(V3Graph* mtasksp) VL_MT_DISABLED; - - VL_UNCOPYABLE(V3Partition); -}; - // ###################################################################### // Partitioner tunable settings: // @@ -208,10 +158,10 @@ constexpr unsigned PART_SIBLING_EDGE_LIMIT = 26; // Don't produce more than a certain maximum number of MTasks. This helps // the TSP variable sort not to blow up (a concern for some of the tests) -// and we probably don't want a huge number of mtasks in practice anyway +// and we probably don't want a huge number of mTaskGraphp in practice anyway // (50 to 100 is typical.) // -// If the user doesn't give one with '--threads-max-mtasks', we'll set the +// If the user doesn't give one with '--threads-max-mTaskGraphp', we'll set the // maximum # of MTasks to // (# of threads * PART_DEFAULT_MAX_MTASKS_PER_THREAD) constexpr unsigned PART_DEFAULT_MAX_MTASKS_PER_THREAD = 50; @@ -246,9 +196,7 @@ struct EdgeKey final { uint64_t m_id; // Unique ID part of edge score uint32_t m_score; // Score part of ID void increase(uint32_t score) { -#if VL_DEBUG - UASSERT(score >= m_score, "Must increase"); -#endif + UDEBUGONLY(UASSERT(score >= m_score, "Must increase");); m_score = score; } bool operator<(const EdgeKey& other) const { @@ -262,10 +210,10 @@ using EdgeHeap = PairingHeap; //============================================================================= // LogicMTask -class LogicMTask final : public AbstractLogicMTask { - VL_RTTI_IMPL(LogicMTask, AbstractLogicMTask) +class LogicMTask final : public V3GraphVertex { + VL_RTTI_IMPL(LogicMTask, V3GraphVertex) template - friend class PartPropagateCp; + friend class PropagateCp; public: // TYPES @@ -317,7 +265,7 @@ private: public: // CONSTRUCTORS LogicMTask(V3Graph* graphp, MTaskMoveVertex* mtmvVxp) - : AbstractLogicMTask{graphp} { + : V3GraphVertex{graphp} { for (uint32_t& item : m_critPathCost) item = 0; if (mtmvVxp) { // Else null for test m_mvertices.push_back(mtmvVxp); @@ -328,7 +276,7 @@ public: // Start at 1, so that 0 indicates no mtask ID. 
static uint32_t s_nextId = 1; m_serialId = s_nextId++; - UASSERT(s_nextId < 0xFFFFFFFFUL, "Too many mtasks"); + UASSERT(s_nextId < 0xFFFFFFFFUL, "Too many mTaskGraphp"); } // METHODS @@ -341,7 +289,7 @@ public: m_mvertices.splice(m_mvertices.end(), otherp->m_mvertices); m_cost += otherp->m_cost; } - const VxList* vertexListp() const override { return &m_mvertices; } + const VxList& vertexList() const { return m_mvertices; } static uint64_t incGeneration() { static uint64_t s_generation = 0; ++s_generation; @@ -349,12 +297,12 @@ public: } // Use this instead of pointer-compares to compare LogicMTasks. Avoids - // nondeterministic output. Also name mtasks based on this number in + // nondeterministic output. Also name mTaskGraphp based on this number in // the final C++ output. - uint32_t id() const override { return m_serialId; } + uint32_t id() const { return m_serialId; } void id(uint32_t id) { m_serialId = id; } // Abstract cost of every logic mtask - uint32_t cost() const override VL_MT_SAFE { return m_cost; } + uint32_t cost() const VL_MT_SAFE { return m_cost; } void setCost(uint32_t cost) { m_cost = cost; } // For tests only uint32_t stepCost() const { return stepCost(m_cost); } static uint32_t stepCost(uint32_t cost) { @@ -373,10 +321,8 @@ public: logcost = logcost / 20.0; const uint32_t stepCost = static_cast(exp(logcost)); -#if VL_DEBUG - UASSERT_STATIC(stepCost >= cost, "stepped cost error exceeded"); - UASSERT_STATIC(stepCost <= ((cost * 11 / 10)), "stepped cost error exceeded"); -#endif + UDEBUGONLY(UASSERT_STATIC(stepCost >= cost, "stepped cost error exceeded");); + UDEBUGONLY(UASSERT_STATIC(stepCost <= ((cost * 11 / 10)), "stepped cost error exceeded");); return stepCost; #else return cost; @@ -392,16 +338,12 @@ public: void addRelativeMTask(LogicMTask* relativep) { // Add the relative to connecting edge map - VL_ATTR_UNUSED const bool exits = !m_edgeSet.emplace(relativep).second; -#if VL_DEBUG - UASSERT(!exits, "Adding existing relative"); -#endif + const bool exits = !m_edgeSet.emplace(relativep).second; + UDEBUGONLY(UASSERT(!exits, "Adding existing relative");); } void removeRelativeMTask(LogicMTask* relativep) { - VL_ATTR_UNUSED const size_t removed = m_edgeSet.erase(relativep); -#if VL_DEBUG - UASSERT(removed, "Relative should have been in set"); -#endif + const size_t removed = m_edgeSet.erase(relativep); + UDEBUGONLY(UASSERT(removed, "Relative should have been in set");); } bool hasRelativeMTask(LogicMTask* relativep) const { return m_edgeSet.count(relativep); } @@ -475,7 +417,7 @@ public: return pathExistsFromInternal(fromp, top, excludedEdgep, incGeneration()); } - static void dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment); + static void dumpCpFilePrefixed(const V3Graph& graph, const string& nameComment); private: VL_UNCOPYABLE(LogicMTask); @@ -484,14 +426,6 @@ private: //###################################################################### // MTask utility classes -// Sort AbstractMTask objects into deterministic order by calling id() -// which is a unique and stable serial number. 
-struct MTaskIdLessThan final { - bool operator()(const AbstractMTask* lhsp, const AbstractMTask* rhsp) const { - return lhsp->id() < rhsp->id(); - } -}; - struct MergeCandidateKey final { // Note: Structure layout chosen to minimize padding in PairingHeao<*>::Node uint64_t m_id; // Unique ID part of edge score @@ -534,9 +468,7 @@ class MergeCandidate VL_NOT_FINAL : public MergeCandidateScoreboard::Node { public: // METHODS SiblingMC* toSiblingMC(); // Instead of cast<>/as<> - const SiblingMC* toSiblingMC() const; // Instead of cast<>/as<> MTaskEdge* toMTaskEdge(); // Instead of cast<>/as<> - const MTaskEdge* toMTaskEdge() const; // Instead of cast<>/as<> bool mergeWouldCreateCycle() const; // Instead of virtual method inline void rescore(); @@ -599,7 +531,7 @@ class MTaskEdge final : public V3GraphEdge, public MergeCandidate { VL_RTTI_IMPL(MTaskEdge, V3GraphEdge) friend class LogicMTask; template - friend class PartPropagateCp; + friend class PropagateCp; // MEMBERS // This edge can be in 2 EdgeHeaps, one forward and one reverse. We allocate the heap nodes @@ -689,10 +621,8 @@ uint32_t LogicMTask::critPathCostWithout(GraphWay way, const V3GraphEdge* withou // Compute the critical path cost wayward to this node, without considering edge 'withoutp'. // We need to look at two edges at most, the critical path if that is not via 'withoutp', // or the second-worst path, if the critical path is via 'withoutp'. -#if VL_DEBUG - UASSERT(withoutp->furtherp(way) == this, - "In critPathCostWithout(), edge 'withoutp' must further to 'this'"); -#endif + UDEBUGONLY(UASSERT(withoutp->furtherp(way) == this, + "In critPathCostWithout(), edge 'withoutp' must further to 'this'");); const GraphWay inv = way.invert(); const EdgeHeap& edgeHeap = m_edgeHeap[inv]; const EdgeHeap::Node* const maxp = edgeHeap.max(); @@ -703,7 +633,7 @@ uint32_t LogicMTask::critPathCostWithout(GraphWay way, const V3GraphEdge* withou return secp->key().m_score; } -void LogicMTask::dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment) { +void LogicMTask::dumpCpFilePrefixed(const V3Graph& graph, const string& nameComment) { const string filename = v3Global.debugFilename(nameComment) + ".txt"; UINFO(1, "Writing " << filename << endl); const std::unique_ptr ofp{V3File::new_ofstream(filename)}; @@ -712,7 +642,7 @@ void LogicMTask::dumpCpFilePrefixed(const V3Graph* graphp, const string& nameCom // Find start vertex with longest CP LogicMTask* startp = nullptr; - for (V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { + for (V3GraphVertex* vxp = graph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { LogicMTask* const mtaskp = static_cast(vxp); if (!startp) { startp = mtaskp; @@ -744,18 +674,11 @@ void LogicMTask::dumpCpFilePrefixed(const V3Graph* graphp, const string& nameCom // Dump for (const LogicMTask* mtaskp : path) { *osp << "begin mtask with cost " << mtaskp->cost() << '\n'; - for (VxList::const_iterator lit = mtaskp->vertexListp()->begin(); - lit != mtaskp->vertexListp()->end(); ++lit) { - const OrderLogicVertex* const logicp = (*lit)->logicp(); + for (MTaskMoveVertex* const mVtxp : mtaskp->vertexList()) { + const OrderLogicVertex* const logicp = mVtxp->logicp(); if (!logicp) continue; - if (false) { - // Show nodes only - *osp << "> "; - logicp->nodep()->dumpTree(*osp); - } else { - // Show nodes with hierarchical costs - V3InstrCount::count(logicp->nodep(), false, osp); - } + // Show nodes with hierarchical costs + V3InstrCount::count(logicp->nodep(), false, osp); } } } @@ -769,14 
+692,6 @@ MTaskEdge* MergeCandidate::toMTaskEdge() { return isSiblingMC() ? nullptr : static_cast(this); } -const SiblingMC* MergeCandidate::toSiblingMC() const { - return isSiblingMC() ? static_cast(this) : nullptr; -} - -const MTaskEdge* MergeCandidate::toMTaskEdge() const { - return isSiblingMC() ? nullptr : static_cast(this); -} - // Normally this would be a virtual function, but we save space by not having a vtable, // and we know we only have 2 possible subclasses. bool MergeCandidate::mergeWouldCreateCycle() const { @@ -796,10 +711,10 @@ static uint32_t siblingScore(const SiblingMC* sibsp) { static uint32_t edgeScore(const MTaskEdge* edgep) { // Score this edge. Lower is better. The score is the new local CP - // length if we merge these mtasks. ("Local" means the longest + // length if we merge these mTaskGraphp. ("Local" means the longest // critical path running through the merged node.) - const LogicMTask* const top = static_cast(edgep->top()); - const LogicMTask* const fromp = static_cast(edgep->fromp()); + const LogicMTask* const top = edgep->toMTaskp(); + const LogicMTask* const fromp = edgep->fromMTaskp(); const uint32_t mergedCpCostFwd = std::max(fromp->critPathCost(GraphWay::FORWARD), top->critPathCostWithout(GraphWay::FORWARD, edgep)); const uint32_t mergedCpCostRev = std::max(fromp->critPathCostWithout(GraphWay::REVERSE, edgep), @@ -822,8 +737,8 @@ void MergeCandidate::rescore() { // Look at vertex costs (in one way) to form critical paths for each // vertex. -static void partInitHalfCriticalPaths(GraphWay way, V3Graph* mtasksp, bool checkOnly) { - GraphStreamUnordered order(mtasksp, way); +static void partInitHalfCriticalPaths(GraphWay way, V3Graph& mTaskGraph, bool checkOnly) { + GraphStreamUnordered order{&mTaskGraph, way}; const GraphWay rev = way.invert(); for (const V3GraphVertex* vertexp; (vertexp = order.nextp());) { const LogicMTask* const mtaskcp = static_cast(vertexp); @@ -836,9 +751,10 @@ static void partInitHalfCriticalPaths(GraphWay way, V3Graph* mtasksp, bool check #if VL_DEBUG // Run a few asserts on the initial mtask graph, // while we're iterating through... - UASSERT_OBJ(edgep->weight() != 0, mtaskp, "Should be no cut edges in mtasks graph"); + UASSERT_OBJ(edgep->weight() != 0, mtaskp, + "Should be no cut edges in mTaskGraphp graph"); UASSERT_OBJ(relatives.find(edgep->furtherp(rev)) == relatives.end(), mtaskp, - "Should be no redundant edges in mtasks graph"); + "Should be no redundant edges in mTaskGraphp graph"); relatives.insert(edgep->furtherp(rev)); #endif const LogicMTask* const relativep = static_cast(edgep->furtherp(rev)); @@ -854,13 +770,13 @@ static void partInitHalfCriticalPaths(GraphWay way, V3Graph* mtasksp, bool check } // Look at vertex costs to form critical paths for each vertex. -static void partInitCriticalPaths(V3Graph* mtasksp) { - partInitHalfCriticalPaths(GraphWay::FORWARD, mtasksp, false); - partInitHalfCriticalPaths(GraphWay::REVERSE, mtasksp, false); +static void partInitCriticalPaths(V3Graph& mTaskGraph) { + partInitHalfCriticalPaths(GraphWay::FORWARD, mTaskGraph, false); + partInitHalfCriticalPaths(GraphWay::REVERSE, mTaskGraph, false); // Reset all MTaskEdges so that 'm_edges' will show correct CP numbers. // They would have been all zeroes on initial creation of the MTaskEdges. 
- for (V3GraphVertex* vxp = mtasksp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { + for (V3GraphVertex* vxp = mTaskGraph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { for (V3GraphEdge* edgep = vxp->outBeginp(); edgep; edgep = edgep->outNextp()) { MTaskEdge* const mtedgep = edgep->as(); mtedgep->resetCriticalPaths(); @@ -870,10 +786,10 @@ static void partInitCriticalPaths(V3Graph* mtasksp) { // Do an EXPENSIVE check to make sure that all incremental CP updates have // gone correctly. -static void partCheckCriticalPaths(V3Graph* mtasksp) { - partInitHalfCriticalPaths(GraphWay::FORWARD, mtasksp, true); - partInitHalfCriticalPaths(GraphWay::REVERSE, mtasksp, true); - for (V3GraphVertex* vxp = mtasksp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { +static void partCheckCriticalPaths(V3Graph& mTaskGraph) { + partInitHalfCriticalPaths(GraphWay::FORWARD, mTaskGraph, true); + partInitHalfCriticalPaths(GraphWay::REVERSE, mTaskGraph, true); + for (V3GraphVertex* vxp = mTaskGraph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { const LogicMTask* const mtaskp = static_cast(vxp); mtaskp->checkRelativesCp(GraphWay::FORWARD); mtaskp->checkRelativesCp(GraphWay::REVERSE); @@ -881,24 +797,25 @@ static void partCheckCriticalPaths(V3Graph* mtasksp) { } // ###################################################################### -// PartPropagateCp +// PropagateCp -// Propagate increasing critical path (CP) costs through a graph. -// -// Usage: -// * Client increases the cost and/or CP at a node or small set of nodes -// (often a pair in practice, eg. edge contraction.) -// * Client calls PartPropagateCp::cpHasIncreased() one or more times. -// Each call indicates that the inclusive CP of some "seed" vertex -// has increased to a given value. -// * NOTE: PartPropagateCp will neither read nor modify the cost -// or CPs at the seed vertices, it only accesses and modifies -// vertices wayward from the seeds. -// * Client calls PartPropagateCp::go(). Internally, this iteratively -// propagates the new CPs wayward through the graph. -// template -class PartPropagateCp final { +class PropagateCp final { + // Propagate increasing critical path (CP) costs through a graph. + // + // Usage: + // * Client increases the cost and/or CP at a node or small set of nodes + // (often a pair in practice, eg. edge contraction.) + // * Client calls PropagateCp::cpHasIncreased() one or more times. + // Each call indicates that the inclusive CP of some "seed" vertex + // has increased to a given value. + // * NOTE: PropagateCp will neither read nor modify the cost + // or CPs at the seed vertices, it only accesses and modifies + // vertices wayward from the seeds. + // * Client calls PropagateCp::go(). Internally, this iteratively + // propagates the new CPs wayward through the graph. 
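// As a rough illustration of the scheme described above, here is a standalone
// worklist sketch of monotone critical-path propagation over a DAG. All names
// ('Node', 'cpIn', 'propagateCp') are made up for this example and are not part
// of the patch; the real class additionally guarantees that each vertex is
// finalized exactly once, which this simplified version does not attempt.
#include <cstdint>
#include <functional>
#include <queue>
#include <utility>
#include <vector>

struct Node {
    uint32_t cost = 1;  // Execution cost of this node
    uint32_t cpIn = 0;  // Critical path cost up to and including this node
    std::vector<int> outs;  // Indices of downstream nodes
};

// 'seeds' holds (node index, new inclusive CP) pairs whose CP has increased.
// Only nodes wayward (here: downstream) of the seeds are read or written,
// matching the note above about not touching the seed vertices themselves.
inline void propagateCp(std::vector<Node>& nodes,
                        const std::vector<std::pair<int, uint32_t>>& seeds) {
    using Item = std::pair<uint32_t, int>;  // (CP proposed by a predecessor, node index)
    std::priority_queue<Item, std::vector<Item>, std::greater<Item>> pending;
    for (const auto& [idx, newCp] : seeds)
        for (const int dst : nodes[idx].outs) pending.push({newCp, dst});
    while (!pending.empty()) {
        const auto [predCp, idx] = pending.top();
        pending.pop();
        Node& node = nodes[idx];
        const uint32_t newCp = predCp + node.cost;  // Inclusive CP via this predecessor
        if (newCp <= node.cpIn) continue;  // Not an increase: nothing to propagate
        node.cpIn = newCp;
        for (const int dst : node.outs) pending.push({node.cpIn, dst});
    }
}
// Processing pending entries in increasing CP order keeps re-visits rare; the
// production code goes further and avoids them entirely.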
+ // + // TYPES // We keep pending vertices in a heap during critical path propagation @@ -906,9 +823,7 @@ class PartPropagateCp final { LogicMTask* m_mtaskp; // The vertex in the heap uint32_t m_score; // The score of this entry void increase(uint32_t score) { -#if VL_DEBUG - UASSERT(score >= m_score, "Must increase"); -#endif + UDEBUGONLY(UASSERT(score >= m_score, "Must increase");); m_score = score; } bool operator<(const PendingKey& other) const { @@ -929,11 +844,12 @@ class PartPropagateCp final { std::vector> m_allocated; // Allocated heap nodes const bool m_slowAsserts; // Enable nontrivial asserts - std::set m_seen; // Used only with slow asserts to check mtasks visited only once + // Used only with slow asserts to check mTaskGraphp visited only once + std::set m_seen; public: // CONSTRUCTORS - explicit PartPropagateCp(bool slowAsserts) + explicit PropagateCp(bool slowAsserts) : m_slowAsserts{slowAsserts} {} // METHODS @@ -1045,7 +961,7 @@ public: const uint32_t edgeCp = mtaskp->m_edgeHeap[inv].max()->key().m_score; UASSERT_OBJ(edgeCp == newCp, mtaskp, "CP doesn't match longest wayward edge"); // Confirm that we only set each node's CP once. That's an - // important property of PartPropagateCp which allows it to be far + // important property of PropagateCp which allows it to be far // faster than a recursive algorithm on some graphs. const bool first = m_seen.insert(mtaskp).second; UASSERT_OBJ(first, mtaskp, "Set CP on node twice"); @@ -1058,25 +974,19 @@ public: } private: - VL_UNCOPYABLE(PartPropagateCp); -}; + VL_UNCOPYABLE(PropagateCp); -class PartPropagateCpSelfTest final { - // MEMBERS - V3Graph m_graph; // A graph - std::array m_vx; // All vertices within the graph +public: + static void selfTest() { + V3Graph graph; // A graph + std::array vx; // All vertices within the graph - // CONSTRUCTORS - PartPropagateCpSelfTest() = default; - ~PartPropagateCpSelfTest() = default; - - void go() { // Generate a pseudo-random graph std::array rngState = {{0x12345678ULL, 0x9abcdef0ULL}}; // GCC 3.8.0 wants {{}} // Create 50 vertices - for (auto& i : m_vx) { - i = new LogicMTask{&m_graph, nullptr}; + for (auto& i : vx) { + i = new LogicMTask{&graph, nullptr}; i->setCost(1); } // Create 250 edges at random. Edges must go from @@ -1085,24 +995,23 @@ class PartPropagateCpSelfTest final { const unsigned idx1 = V3Os::rand64(rngState) % 50; const unsigned idx2 = V3Os::rand64(rngState) % 50; if (idx1 > idx2) { - if (!m_vx[idx2]->hasRelativeMTask(m_vx[idx1])) { - new MTaskEdge{&m_graph, m_vx[idx2], m_vx[idx1], 1}; + if (!vx[idx2]->hasRelativeMTask(vx[idx1])) { + new MTaskEdge{&graph, vx[idx2], vx[idx1], 1}; } } else if (idx2 > idx1) { - if (!m_vx[idx1]->hasRelativeMTask(m_vx[idx2])) { - new MTaskEdge{&m_graph, m_vx[idx1], m_vx[idx2], 1}; + if (!vx[idx1]->hasRelativeMTask(vx[idx2])) { + new MTaskEdge{&graph, vx[idx1], vx[idx2], 1}; } } } - partInitCriticalPaths(&m_graph); + partInitCriticalPaths(graph); - // This SelfTest class is also the T_CostAccessor - PartPropagateCp prop(true); + PropagateCp prop{true}; // Seed the propagator with every input node; // This should result in the complete graph getting all CP's assigned. - for (const auto& i : m_vx) { + for (const auto& i : vx) { if (!i->inBeginp()) prop.cpHasIncreased(i, 1 /* inclusive CP starts at 1 */); } @@ -1110,48 +1019,44 @@ class PartPropagateCpSelfTest final { prop.go(); // Finally, confirm that the entire graph appears to have correct CPs. 
- partCheckCriticalPaths(&m_graph); + partCheckCriticalPaths(graph); } - -public: - static void selfTest() { PartPropagateCpSelfTest{}.go(); } }; // Merge edges from a LogicMtask. -// -// This code removes adjacent edges. When this occurs, mark it in need -// of a rescore, in case its score has fallen and we need to move it up -// toward the front of the scoreboard. -// -// Wait, what? Shouldn't the scores only increase as we merge nodes? Well -// that's almost true. But there is one exception. -// -// Suppose we have A->B, B->C, and A->C. -// -// The A->C edge is a "transitive" edge. It's ineligible to be merged, as -// the merge would create a cycle. We score it on the scoreboard like any -// other edge. -// -// However, our "score" estimate for A->C is bogus, because the forward -// critical path to C and the reverse critical path to A both contain the -// same node (B) so we overestimate the score of A->C. At first this -// doesn't matter, since transitive edges aren't eligible to merge anyway. -// -// Later, suppose the edge contractor decides to merge the B->C edge, with -// B donating all its incoming edges into C, say. (So we reach this -// function.) -// -// With B going away, the A->C edge will no longer be transitive and it -// will become eligible to merge. But if we don't mark it for rescore, -// it'll stay in the scoreboard with its old (overestimate) score. We'll -// merge it too late due to the bogus score. When we finally merge it, we -// fail the assert in the main edge contraction loop which checks that the -// actual score did not fall below the scoreboard's score. -// -// Another way of stating this: this code ensures that scores of -// non-transitive edges only ever increase. -static void partRedirectEdgesFrom(V3Graph* graphp, LogicMTask* recipientp, LogicMTask* donorp, +static void partRedirectEdgesFrom(V3Graph& graph, LogicMTask* recipientp, LogicMTask* donorp, MergeCandidateScoreboard* sbp) { + // This code removes adjacent edges. When this occurs, mark it in need + // of a rescore, in case its score has fallen and we need to move it up + // toward the front of the scoreboard. + // + // Wait, what? Shouldn't the scores only increase as we merge nodes? Well + // that's almost true. But there is one exception. + // + // Suppose we have A->B, B->C, and A->C. + // + // The A->C edge is a "transitive" edge. It's ineligible to be merged, as + // the merge would create a cycle. We score it on the scoreboard like any + // other edge. + // + // However, our "score" estimate for A->C is bogus, because the forward + // critical path to C and the reverse critical path to A both contain the + // same node (B) so we overestimate the score of A->C. At first this + // doesn't matter, since transitive edges aren't eligible to merge anyway. + // + // Later, suppose the edge contractor decides to merge the B->C edge, with + // B donating all its incoming edges into C, say. (So we reach this + // function.) + // + // With B going away, the A->C edge will no longer be transitive and it + // will become eligible to merge. But if we don't mark it for rescore, + // it'll stay in the scoreboard with its old (overestimate) score. We'll + // merge it too late due to the bogus score. When we finally merge it, we + // fail the assert in the main edge contraction loop which checks that the + // actual score did not fall below the scoreboard's score. + // + // Another way of stating this: this code ensures that scores of + // non-transitive edges only ever increase. 
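// For reference, "transitive" edges like A->C above are exactly the ones whose
// contraction would create a cycle. A minimal, standalone way to test that on
// an adjacency-list DAG is sketched below (made-up names, illustrative only;
// the real check, LogicMTask::pathExistsFrom, uses generation marking to avoid
// re-visiting vertices):
#include <vector>

// True if contracting edge u -> w would create a cycle, i.e. if another
// u ~> w path exists that does not use the u -> w edge itself.
inline bool contractionWouldCreateCycle(const std::vector<std::vector<int>>& outs, int u,
                                        int w) {
    std::vector<char> seen(outs.size(), 0);
    std::vector<int> stack;
    bool skippedDirect = false;
    for (const int v : outs[u]) {  // Seed with u's successors, minus one direct u -> w edge
        if (v == w && !skippedDirect) { skippedDirect = true; continue; }
        stack.push_back(v);
    }
    while (!stack.empty()) {
        const int v = stack.back();
        stack.pop_back();
        if (v == w) return true;  // Found an alternative path ending at w
        if (seen[v]) continue;
        seen[v] = 1;
        for (const int succ : outs[v]) stack.push_back(succ);
    }
    return false;
}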
// Process outgoing edges MTaskEdge* outNextp = static_cast(donorp->outBeginp()); @@ -1169,9 +1074,7 @@ static void partRedirectEdgesFrom(V3Graph* graphp, LogicMTask* recipientp, Logic if (sbp->contains(edgep)) sbp->remove(edgep); MTaskEdge* const existMTaskEdgep = static_cast( recipientp->findConnectingEdgep(GraphWay::FORWARD, relativep)); -#if VL_DEBUG - UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge"); -#endif + UDEBUGONLY(UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge");); if (sbp->contains(existMTaskEdgep)) sbp->hintScoreChanged(existMTaskEdgep); } VL_DO_DANGLING(edgep->unlinkDelete(), edgep); @@ -1209,9 +1112,7 @@ static void partRedirectEdgesFrom(V3Graph* graphp, LogicMTask* recipientp, Logic if (sbp->contains(edgep)) sbp->remove(edgep); MTaskEdge* const existMTaskEdgep = static_cast( recipientp->findConnectingEdgep(GraphWay::REVERSE, relativep)); -#if VL_DEBUG - UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge"); -#endif + UDEBUGONLY(UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge");); if (sbp->contains(existMTaskEdgep)) sbp->hintScoreChanged(existMTaskEdgep); } VL_DO_DANGLING(edgep->unlinkDelete(), edgep); @@ -1233,14 +1134,14 @@ static void partRedirectEdgesFrom(V3Graph* graphp, LogicMTask* recipientp, Logic } // Remove donorp from the graph - VL_DO_DANGLING(donorp->unlinkDelete(graphp), donorp); + VL_DO_DANGLING(donorp->unlinkDelete(&graph), donorp); } //###################################################################### -// PartContraction +// Contraction // Perform edge or sibling contraction on the partition graph -class PartContraction final { +class Contraction final { // TYPES // New CP information for mtaskp reflecting an upcoming merge struct NewCp final { @@ -1250,39 +1151,36 @@ class PartContraction final { }; // MEMBERS - V3Graph* const m_mtasksp; // Mtask graph + V3Graph& m_mTaskGraph; // The Mtask graph uint32_t m_scoreLimit; // Sloppy score allowed when picking merges uint32_t m_scoreLimitBeforeRescore = 0xffffffff; // Next score rescore at unsigned m_mergesSinceRescore = 0; // Merges since last rescore const bool m_slowAsserts; // Take extra time to validate algorithm MergeCandidateScoreboard m_sb; // Scoreboard - PartPropagateCp m_forwardPropagator{m_slowAsserts}; // Forward propagator - PartPropagateCp m_reversePropagator{m_slowAsserts}; // Reverse propagator + PropagateCp m_forwardPropagator{m_slowAsserts}; // Forward propagator + PropagateCp m_reversePropagator{m_slowAsserts}; // Reverse propagator LogicMTask* const m_entryMTaskp; // Singular source vertex of the dependency graph LogicMTask* const m_exitMTaskp; // Singular sink vertex of the dependency graph public: // CONSTRUCTORS - PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, LogicMTask* entryMTaskp, - LogicMTask* exitMTaskp, bool slowAsserts) - : m_mtasksp{mtasksp} + Contraction(V3Graph& mTaskGraph, uint32_t scoreLimit, LogicMTask* entryMTaskp, + LogicMTask* exitMTaskp, bool slowAsserts) + : m_mTaskGraph{mTaskGraph} , m_scoreLimit{scoreLimit} , m_slowAsserts{slowAsserts} , m_entryMTaskp{entryMTaskp} - , m_exitMTaskp{exitMTaskp} {} - - // METHODS - void go() { + , m_exitMTaskp{exitMTaskp} { if (m_slowAsserts) { // Check there are no redundant edges - for (V3GraphVertex* itp = m_mtasksp->verticesBeginp(); itp; + for (V3GraphVertex* itp = m_mTaskGraph.verticesBeginp(); itp; itp = itp->verticesNextp()) { std::unordered_set neighbors; for (V3GraphEdge* edgep = itp->outBeginp(); edgep; edgep = edgep->outNextp()) { const bool first = 
neighbors.insert(edgep->top()).second; - UASSERT_OBJ(first, itp, "Redundant edge found in input to PartContraction()"); + UASSERT_OBJ(first, itp, "Redundant edge found in input to Contraction()"); } } } @@ -1292,22 +1190,22 @@ public: if (v3Global.opt.threads() > 1) { maxMTasks = (PART_DEFAULT_MAX_MTASKS_PER_THREAD * v3Global.opt.threads()); } else { - // Running PartContraction with --threads <= 1 means self-test + // Running Contraction with --threads <= 1 means self-test maxMTasks = 500; } } // OPTIMIZATION PASS: Edge contraction and sibling contraction. - // - Score each pair of mtasks which is a candidate to merge. + // - Score each pair of mTaskGraphp which is a candidate to merge. // * Each edge defines such a candidate pair - // * Two mtasks that are prereqs or postreqs of a common third + // * Two mTaskGraphp that are prereqs or postreqs of a common third // vertex are "siblings", these are also a candidate pair. // - Build a list of MergeCandidates, sorted by score. // - Merge the best pair. // - Incrementally recompute critical paths near the merged mtask. - for (V3GraphVertex* itp = m_mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) { - itp->userp(nullptr); // Reset user value while we are here. Used by PartPropagateCp. + for (V3GraphVertex* itp = m_mTaskGraph.verticesBeginp(); itp; itp = itp->verticesNextp()) { + itp->userp(nullptr); // Reset user value while we are here. Used by PropagateCp. for (V3GraphEdge* edgep = itp->outBeginp(); edgep; edgep = edgep->outNextp()) { m_sb.add(static_cast(edgep)); } @@ -1358,10 +1256,10 @@ public: } else { // We've exhausted everything below m_scoreLimit; stop. - // Except, if we have too many mtasks, raise the score + // Except, if we have too many mTaskGraphp, raise the score // limit and keep going... unsigned mtaskCount = 0; - for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp(); vxp; + for (V3GraphVertex* vxp = m_mTaskGraph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { ++mtaskCount; } @@ -1513,8 +1411,8 @@ private: MTaskEdge* const mergeEdgep = mergeCanp->toMTaskEdge(); SiblingMC* const mergeSibsp = mergeCanp->toSiblingMC(); if (mergeEdgep) { - top = static_cast(mergeEdgep->top()); - fromp = static_cast(mergeEdgep->fromp()); + top = mergeEdgep->toMTaskp(); + fromp = mergeEdgep->fromMTaskp(); } else { top = mergeSibsp->ap(); fromp = mergeSibsp->bp(); @@ -1540,9 +1438,9 @@ private: // Recursively update forward and reverse CP numbers. // - // Doing this before merging the mtasks lets us often avoid + // Doing this before merging the mTaskGraphp lets us often avoid // recursing through either incoming or outgoing edges on one or - // both mtasks. + // both mTaskGraphp. // // These 'NewCp' objects carry a bit indicating whether we must // propagate CP for each of the four cases: @@ -1601,13 +1499,13 @@ private: removeSiblingMCs(recipientp, donorp); // Redirect all edges, delete donorp - partRedirectEdgesFrom(m_mtasksp, recipientp, donorp, &m_sb); + partRedirectEdgesFrom(m_mTaskGraph, recipientp, donorp, &m_sb); ++m_mergesSinceRescore; // Do an expensive check, confirm we haven't botched the CP // updates. - if (m_slowAsserts) partCheckCriticalPaths(m_mtasksp); + if (m_slowAsserts) partCheckCriticalPaths(m_mTaskGraph); // Finally, make new sibling pairs as needed: // - prereqs and postreqs of recipientp @@ -1745,29 +1643,29 @@ private: } static uint64_t partitionChainUsecs(unsigned chain_len) { - // NOTE: To get a dot file run with --debugi-V3Partition 4 or more. 
+ // NOTE: To get a dot file run with --debugi-Partitioner 4 or more. const uint64_t startUsecs = V3Os::timeUsecs(); - V3Graph mtasks; + V3Graph mTaskGraph; LogicMTask* lastp = nullptr; for (unsigned i = 0; i < chain_len; ++i) { - LogicMTask* const mtp = new LogicMTask{&mtasks, nullptr}; + LogicMTask* const mtp = new LogicMTask{&mTaskGraph, nullptr}; mtp->setCost(1); - if (lastp) new MTaskEdge{&mtasks, lastp, mtp, 1}; + if (lastp) new MTaskEdge{&mTaskGraph, lastp, mtp, 1}; lastp = mtp; } - partInitCriticalPaths(&mtasks); + partInitCriticalPaths(mTaskGraph); // Since slowAsserts mode is *expected* to cause N^2 runtime, and the // intent of this test is to demonstrate better-than-N^2 runtime, disable // slowAsserts. - PartContraction ec{&mtasks, + Contraction::apply(mTaskGraph, // Any CP limit >chain_len should work: - chain_len * 2, nullptr, nullptr, false /* slowAsserts */}; - ec.go(); + chain_len * 2, nullptr, nullptr, /* slowAsserts: */ false); // All vertices should merge into one UASSERT_SELFTEST( - bool, mtasks.verticesBeginp() && !mtasks.verticesBeginp()->verticesNextp(), true); + bool, mTaskGraph.verticesBeginp() && !mTaskGraph.verticesBeginp()->verticesNextp(), + true); const uint64_t endUsecs = V3Os::timeUsecs(); const uint64_t elapsedUsecs = endUsecs - startUsecs; @@ -1790,32 +1688,32 @@ private: // merges in the event that scores are tied. This is better for the // test and also real designs. static void selfTestX() { - // NOTE: To get a dot file run with --debugi-V3Partition 4 or more. - V3Graph mtasks; - LogicMTask* const centerp = new LogicMTask{&mtasks, nullptr}; + // NOTE: To get a dot file run with --debugi-Partitioner 4 or more. + V3Graph mTaskGraph; + LogicMTask* const centerp = new LogicMTask{&mTaskGraph, nullptr}; centerp->setCost(1); unsigned i; for (i = 0; i < 50; ++i) { - LogicMTask* const mtp = new LogicMTask{&mtasks, nullptr}; + LogicMTask* const mtp = new LogicMTask{&mTaskGraph, nullptr}; mtp->setCost(1); // Edge from every input -> centerp - new MTaskEdge{&mtasks, mtp, centerp, 1}; + new MTaskEdge{&mTaskGraph, mtp, centerp, 1}; } for (i = 0; i < 50; ++i) { - LogicMTask* const mtp = new LogicMTask{&mtasks, nullptr}; + LogicMTask* const mtp = new LogicMTask{&mTaskGraph, nullptr}; mtp->setCost(1); // Edge from centerp -> every output - new MTaskEdge{&mtasks, centerp, mtp, 1}; + new MTaskEdge{&mTaskGraph, centerp, mtp, 1}; } - partInitCriticalPaths(&mtasks); - PartContraction{&mtasks, 20, nullptr, nullptr, true}.go(); + partInitCriticalPaths(mTaskGraph); + Contraction::apply(mTaskGraph, 20, nullptr, nullptr, true); - const auto report = mtasks.parallelismReport( + const auto report = mTaskGraph.parallelismReport( [](const V3GraphVertex* vtxp) { return vtxp->as()->cost(); }); // Checking exact values here is maybe overly precise. What we're - // mostly looking for is a healthy reduction in the number of mtasks. + // mostly looking for is a healthy reduction in the number of mTaskGraphp. 
UASSERT_SELFTEST(uint32_t, report.criticalPathCost(), 19); UASSERT_SELFTEST(uint32_t, report.totalGraphCost(), 101); UASSERT_SELFTEST(uint32_t, report.vertexCount(), 14); @@ -1828,8 +1726,10 @@ public: selfTestChain(); } -private: - VL_UNCOPYABLE(PartContraction); + static void apply(V3Graph& mTaskGraph, uint32_t scoreLimit, LogicMTask* entryMTaskp, + LogicMTask* exitMTaskp, bool slowAsserts) { + Contraction{mTaskGraph, scoreLimit, entryMTaskp, exitMTaskp, slowAsserts}; + } }; //###################################################################### @@ -1871,170 +1771,112 @@ private: }; //###################################################################### -// PartFixDataHazards +// FixDataHazards + +class FixDataHazards final { + // + // Fix data hazards in the MTask graph. + // + // The fine-grained graph from V3Order may contain data hazards which are + // not a problem for serial mode, but which would be a problem in parallel + // mode. + // + // There are basically two classes: unordered pairs of writes, and + // unordered write-read pairs. We fix both here, with a combination of + // MTask-merges and new edges to ensure no such unordered pairs remain. + // + // ABOUT UNORDERED WRITE-WRITE PAIRS + // + // The V3Order dependency graph treats these as unordered events: + // + // a) sig[15:8] = stuff; + // ... + // b) sig[7:0] = other_stuff; + // + // Seems OK right? They are writes to disjoint bits of the same + // signal. They can run in either order, in serial mode, and the result + // will be the same. + // + // The resulting C code for each of this isn't a pure write, it's + // actually an R-M-W sequence: + // + // a) sig = (sig & 0xff) | (0xff00 & (stuff << 8)); + // ... + // b) sig = (sig & 0xff00) | (0xff & other_stuff); + // + // In serial mode, order doesn't matter so long as these run serially. + // In parallel mode, we must serialize these RMW's to avoid a race. + // + // We don't actually check here if each write would involve an R-M-W, we + // just assume that it would. If this routine ever causes a drastic + // increase in critical path, it could be optimized to make a better + // prediction (with all the risk that word implies!) about whether a + // given write is likely to turn into an R-M-W. + // + // ABOUT UNORDERED WRITE-READ PAIRS + // + // If we don't put unordered write-read pairs into some order at Verilation + // time, we risk a runtime race. + // + // How do such unordered writer/reader pairs happen? Here's a partial list + // of scenarios: + // + // Case 1: Circular logic + // + // If the design has circular logic, V3Order has by now generated some + // dependency cycles, and also cut some of the edges to make it + // acyclic. + // + // For serial mode, that was fine. We can break logic circles at an + // arbitrary point. At runtime, we'll repeat the _eval() until no + // changes are detected, which papers over the discarded dependency. + // + // For parallel mode, this situation can lead to unordered reads and + // writes of the same variable, causing a data race. For example if the + // original code is this: + // + // assign b = b | a << 2; + // assign out = b; + // + // ... there's originally a dependency edge which records that 'b' + // depends on the first assign. V3Order may cut this edge, making the + // statements unordered. In serial mode that's fine, they can run in + // either order. In parallel mode it's a reader/writer race. + // + // Case 2: Race Condition in Verilog Sources + // + // If the input has races, eg. 
blocking assignments in always blocks + // that share variables, the graph at this point will contain unordered + // writes and reads (or unordered write-write pairs) reflecting that. + // + // Case 3: Interesting V3Order Behavior + // + // There's code in V3Order that explicitly avoids making a dependency + // edge from a clock-gater signal to the logic node that produces the + // clock signal. This leads to unordered reader/writer pairs in + // parallel mode. + // -// Fix data hazards in the partition graph. -// -// The fine-grained graph from V3Order may contain data hazards which are -// not a problem for serial mode, but which would be a problem in parallel -// mode. -// -// There are basically two classes: unordered pairs of writes, and -// unordered write-read pairs. We fix both here, with a combination of -// MTask-merges and new edges to ensure no such unordered pairs remain. -// -// ABOUT UNORDERED WRITE-WRITE PAIRS -// -// The V3Order dependency graph treats these as unordered events: -// -// a) sig[15:8] = stuff; -// ... -// b) sig[7:0] = other_stuff; -// -// Seems OK right? They are writes to disjoint bits of the same -// signal. They can run in either order, in serial mode, and the result -// will be the same. -// -// The resulting C code for each of this isn't a pure write, it's -// actually an R-M-W sequence: -// -// a) sig = (sig & 0xff) | (0xff00 & (stuff << 8)); -// ... -// b) sig = (sig & 0xff00) | (0xff & other_stuff); -// -// In serial mode, order doesn't matter so long as these run serially. -// In parallel mode, we must serialize these RMW's to avoid a race. -// -// We don't actually check here if each write would involve an R-M-W, we -// just assume that it would. If this routine ever causes a drastic -// increase in critical path, it could be optimized to make a better -// prediction (with all the risk that word implies!) about whether a -// given write is likely to turn into an R-M-W. -// -// ABOUT UNORDERED WRITE-READ PAIRS -// -// If we don't put unordered write-read pairs into some order at Verilation -// time, we risk a runtime race. -// -// How do such unordered writer/reader pairs happen? Here's a partial list -// of scenarios: -// -// Case 1: Circular logic -// -// If the design has circular logic, V3Order has by now generated some -// dependency cycles, and also cut some of the edges to make it -// acyclic. -// -// For serial mode, that was fine. We can break logic circles at an -// arbitrary point. At runtime, we'll repeat the _eval() until no -// changes are detected, which papers over the discarded dependency. -// -// For parallel mode, this situation can lead to unordered reads and -// writes of the same variable, causing a data race. For example if the -// original code is this: -// -// assign b = b | a << 2; -// assign out = b; -// -// ... there's originally a dependency edge which records that 'b' -// depends on the first assign. V3Order may cut this edge, making the -// statements unordered. In serial mode that's fine, they can run in -// either order. In parallel mode it's a reader/writer race. -// -// Case 2: Race Condition in Verilog Sources -// -// If the input has races, eg. blocking assignments in always blocks -// that share variables, the graph at this point will contain unordered -// writes and reads (or unordered write-write pairs) reflecting that. 
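// To make the write-write (R-M-W) hazard described above concrete, here is a
// minimal standalone sketch (made-up function and variable names, not part of
// this patch) of how two disjoint part-selects of the same signal become
// read-modify-write sequences in the generated C++:
#include <cstdint>

inline void writeHighByte(uint16_t& sig, uint8_t stuff) {
    // sig[15:8] = stuff;  -- reads *all* of 'sig', then writes all of it back
    sig = (sig & 0x00ffU) | static_cast<uint16_t>(static_cast<uint16_t>(stuff) << 8);
}
inline void writeLowByte(uint16_t& sig, uint8_t otherStuff) {
    // sig[7:0] = other_stuff;  -- likewise a full read-modify-write
    sig = (sig & 0xff00U) | otherStuff;
}
// Run serially, in either order, both updates survive. Run concurrently on the
// same 'sig' from different MTasks, one thread can overwrite the other's byte
// with stale data, which is why such pairs are serialized here.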
-// -// Case 3: Interesting V3Order Behavior -// -// There's code in V3Order that explicitly avoids making a dependency -// edge from a clock-gater signal to the logic node that produces the -// clock signal. This leads to unordered reader/writer pairs in -// parallel mode. -// -class PartFixDataHazards final { // TYPES + // Sort LogicMTask objects into deterministic order by calling id() + // which is a unique and stable serial number. + struct MTaskIdLessThan final { + bool operator()(const LogicMTask* lhsp, const LogicMTask* rhsp) const { + return lhsp->id() < rhsp->id(); + } + }; using TasksByRank = std::map>; // MEMBERS - const OrderGraph* const m_orderGraphp; // The OrderGraph - V3Graph* const m_mtasksp; // Mtask graph -public: + V3Graph& m_mTaskGraph; // The Mtask graph + // CONSTRUCTORs - explicit PartFixDataHazards(const OrderGraph* orderGraphp, V3Graph* mtasksp) - : m_orderGraphp{orderGraphp} - , m_mtasksp{mtasksp} {} - // METHODS -private: - void findAdjacentTasks(const OrderVarStdVertex* varVtxp, TasksByRank& tasksByRank) { - // Find all writer tasks for this variable, group by rank. - for (V3GraphEdge* edgep = varVtxp->inBeginp(); edgep; edgep = edgep->inNextp()) { - if (const auto* const logicVtxp = edgep->fromp()->cast()) { - LogicMTask* const writerMtaskp = static_cast(logicVtxp->userp()); - tasksByRank[writerMtaskp->rank()].insert(writerMtaskp); - } - } - // Not: Find all reader tasks for this variable, group by rank. - // There was "broken" code here to find readers, but fixing it to - // work properly harmed performance on some tests, see issue #3360. - } - void mergeSameRankTasks(const TasksByRank& tasksByRank) { - LogicMTask* lastRecipientp = nullptr; - for (const auto& pair : tasksByRank) { - // Find the largest node at this rank, merge into it. (If we - // happen to find a huge node, this saves time in - // partRedirectEdgesFrom() versus merging into an arbitrary node.) - LogicMTask* recipientp = nullptr; - for (LogicMTask* const mtaskp : pair.second) { - if (!recipientp || (recipientp->cost() < mtaskp->cost())) recipientp = mtaskp; - } - UASSERT_OBJ(!lastRecipientp || (lastRecipientp->rank() < recipientp->rank()), - recipientp, "Merging must be on lower rank"); - - for (LogicMTask* const donorp : pair.second) { - // Merge donor into recipient. - if (donorp == recipientp) continue; - // Fix up the map, so donor's OLVs map to recipientp - for (const MTaskMoveVertex* const tmvp : *(donorp->vertexListp())) { - tmvp->logicp()->userp(recipientp); - } - // Move all vertices from donorp to recipientp - recipientp->moveAllVerticesFrom(donorp); - // Redirect edges from donorp to recipientp, delete donorp - partRedirectEdgesFrom(m_mtasksp, recipientp, donorp, nullptr); - } - - if (lastRecipientp && !lastRecipientp->hasRelativeMTask(recipientp)) { - new MTaskEdge{m_mtasksp, lastRecipientp, recipientp, 1}; - } - lastRecipientp = recipientp; - } - } - bool hasDpiHazard(LogicMTask* mtaskp) { - for (const MTaskMoveVertex* const moveVtxp : *(mtaskp->vertexListp())) { - if (OrderLogicVertex* const lvtxp = moveVtxp->logicp()) { - // NOTE: We don't handle DPI exports. If testbench code calls a - // DPI-exported function at any time during eval() we may have - // a data hazard. (Likewise in non-threaded mode if an export - // messes with an ordered variable we're broken.) - - // Find all calls to DPI-imported functions, we can put those - // into a serial order at least. That should solve the most - // likely DPI-related data hazards. 
- if (DpiImportCallVisitor{lvtxp->nodep()}.hasDpiHazard()) return true; - } - } - return false; - } - -public: - void go() { + FixDataHazards(const OrderGraph& orderGraph, V3Graph& mTaskGraph) + : m_mTaskGraph{mTaskGraph} { // Rank the graph. DGS is faster than V3GraphAlg's recursive rank, and also allows us to // set up the OrderLogicVertex -> LogicMTask map at the same time. { - GraphStreamUnordered serialize{m_mtasksp}; + GraphStreamUnordered serialize{&m_mTaskGraph}; while (LogicMTask* const mtaskp = const_cast(static_cast(serialize.nextp()))) { // Compute and assign rank @@ -2046,11 +1888,10 @@ public: // Set up the OrderLogicVertex -> LogicMTask map // Entry and exit MTasks have no MTaskMoveVertices under them, so move on - if (mtaskp->vertexListp()->empty()) continue; + if (mtaskp->vertexList().empty()) continue; // Otherwise there should be only one MTaskMoveVertex in each MTask at this stage - UASSERT_OBJ(mtaskp->vertexListp()->size() == 1, mtaskp, - "Multiple MTaskMoveVertex"); - const MTaskMoveVertex* const moveVtxp = mtaskp->vertexListp()->front(); + UASSERT_OBJ(mtaskp->vertexList().size() == 1, mtaskp, "Multiple MTaskMoveVertex"); + const MTaskMoveVertex* const moveVtxp = mtaskp->vertexList().front(); // Set up mapping back to the MTask from the OrderLogicVertex if (OrderLogicVertex* const lvtxp = moveVtxp->logicp()) lvtxp->userp(mtaskp); } @@ -2059,7 +1900,7 @@ public: // Gather all variables. SystemC vars will be handled slightly specially, so keep separate. std::vector regularVars; std::vector systemCVars; - for (V3GraphVertex *vtxp = m_orderGraphp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) { + for (V3GraphVertex *vtxp = orderGraph.verticesBeginp(), *nextp; vtxp; vtxp = nextp) { nextp = vtxp->verticesNextp(); // Only consider OrderVarStdVertex which reflects // an actual lvalue assignment; the others do not. @@ -2072,21 +1913,21 @@ public: } } - // For each OrderVarVertex, look at its writer and reader mtasks. + // For each OrderVarVertex, look at its writer and reader mTaskGraphp. // // If there's a set of writers and readers at the same rank, we // know these are unordered with respect to one another, so merge - // those mtasks all together. + // those mTaskGraphp all together. // // At this point, we have at most one merged mtask per rank (for a - // given OVV.) Create edges across these remaining mtasks to ensure + // given OVV.) Create edges across these remaining mTaskGraphp to ensure // they run in serial order (going along with the existing ranks.) // // NOTE: we don't update the CP's stored in the LogicMTasks to // reflect the changes we make to the graph. That's OK, as we // haven't yet initialized CPs when we call this routine. for (const OrderVarStdVertex* const varVtxp : regularVars) { - // Build a set of mtasks, per rank, which access this var. + // Build a set of mTaskGraphp, per rank, which access this var. // Within a rank, sort by MTaskID to avoid nondeterminism. TasksByRank tasksByRank; @@ -2136,7 +1977,7 @@ public: // Same basic strategy as above to serialize access to SC vars. 
if (!v3Global.opt.threadsDpiPure() || !v3Global.opt.threadsDpiUnpure()) { TasksByRank tasksByRank; - for (V3GraphVertex *vtxp = m_mtasksp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) { + for (V3GraphVertex *vtxp = m_mTaskGraph.verticesBeginp(), *nextp; vtxp; vtxp = nextp) { nextp = vtxp->verticesNextp(); LogicMTask* const mtaskp = static_cast(vtxp); if (hasDpiHazard(mtaskp)) tasksByRank[mtaskp->rank()].insert(mtaskp); @@ -2145,14 +1986,81 @@ public: } } -private: - VL_UNCOPYABLE(PartFixDataHazards); + // METHODS + void findAdjacentTasks(const OrderVarStdVertex* varVtxp, TasksByRank& tasksByRank) { + // Find all writer tasks for this variable, group by rank. + for (V3GraphEdge* edgep = varVtxp->inBeginp(); edgep; edgep = edgep->inNextp()) { + if (const auto* const logicVtxp = edgep->fromp()->cast()) { + LogicMTask* const writerMtaskp = static_cast(logicVtxp->userp()); + tasksByRank[writerMtaskp->rank()].insert(writerMtaskp); + } + } + // Not: Find all reader tasks for this variable, group by rank. + // There was "broken" code here to find readers, but fixing it to + // work properly harmed performance on some tests, see issue #3360. + } + void mergeSameRankTasks(const TasksByRank& tasksByRank) { + LogicMTask* lastRecipientp = nullptr; + for (const auto& pair : tasksByRank) { + // Find the largest node at this rank, merge into it. (If we + // happen to find a huge node, this saves time in + // partRedirectEdgesFrom() versus merging into an arbitrary node.) + LogicMTask* recipientp = nullptr; + for (LogicMTask* const mtaskp : pair.second) { + if (!recipientp || (recipientp->cost() < mtaskp->cost())) recipientp = mtaskp; + } + UASSERT_OBJ(!lastRecipientp || (lastRecipientp->rank() < recipientp->rank()), + recipientp, "Merging must be on lower rank"); + + for (LogicMTask* const donorp : pair.second) { + // Merge donor into recipient. + if (donorp == recipientp) continue; + // Fix up the map, so donor's OLVs map to recipientp + for (const MTaskMoveVertex* const tmvp : donorp->vertexList()) { + tmvp->logicp()->userp(recipientp); + } + // Move all vertices from donorp to recipientp + recipientp->moveAllVerticesFrom(donorp); + // Redirect edges from donorp to recipientp, delete donorp + partRedirectEdgesFrom(m_mTaskGraph, recipientp, donorp, nullptr); + } + + if (lastRecipientp && !lastRecipientp->hasRelativeMTask(recipientp)) { + new MTaskEdge{&m_mTaskGraph, lastRecipientp, recipientp, 1}; + } + lastRecipientp = recipientp; + } + } + bool hasDpiHazard(LogicMTask* mtaskp) { + for (const MTaskMoveVertex* const moveVtxp : mtaskp->vertexList()) { + if (OrderLogicVertex* const lvtxp = moveVtxp->logicp()) { + // NOTE: We don't handle DPI exports. If testbench code calls a + // DPI-exported function at any time during eval() we may have + // a data hazard. (Likewise in non-threaded mode if an export + // messes with an ordered variable we're broken.) + + // Find all calls to DPI-imported functions, we can put those + // into a serial order at least. That should solve the most + // likely DPI-related data hazards. 
+ if (DpiImportCallVisitor{lvtxp->nodep()}.hasDpiHazard()) return true; + } + } + return false; + } + + VL_UNCOPYABLE(FixDataHazards); + +public: + static void apply(const OrderGraph& orderGraph, V3Graph& mTaskGraph) { + FixDataHazards(orderGraph, mTaskGraph); + } }; //###################################################################### -// V3Partition implementation +// Partitioner implementation -void V3Partition::debugMTaskGraphStats(const V3Graph* graphp, const string& stage) { +// Print debug stats about graphp whose nodes must be LogicMTask's. +static void debugMTaskGraphStats(const V3Graph& graph, const string& stage) { if (!debug() && !dumpLevel() && !dumpGraphLevel()) return; UINFO(4, "\n"); @@ -2162,10 +2070,10 @@ void V3Partition::debugMTaskGraphStats(const V3Graph* graphp, const string& stag std::array mtaskCostHist; mtaskCostHist.fill(0); - for (const V3GraphVertex* mtaskp = graphp->verticesBeginp(); mtaskp; + for (const V3GraphVertex* mtaskp = graph.verticesBeginp(); mtaskp; mtaskp = mtaskp->verticesNextp()) { ++mtaskCount; - uint32_t mtaskCost = mtaskp->as()->cost(); + uint32_t mtaskCost = mtaskp->as()->cost(); totalCost += mtaskCost; unsigned log2Cost = 0; @@ -2190,14 +2098,14 @@ void V3Partition::debugMTaskGraphStats(const V3Graph* graphp, const string& stag if (mtaskCount < 1000) { string filePrefix("ordermv_"); filePrefix += stage; - if (dumpGraphLevel() >= 4) graphp->dumpDotFilePrefixedAlways(filePrefix); + if (dumpGraphLevel() >= 4) graph.dumpDotFilePrefixedAlways(filePrefix); } // Look only at the cost of each mtask, neglect communication cost. // This will show us how much parallelism we expect, assuming cache-miss // costs are minor and the cost of running logic is the dominant cost. - const auto report = graphp->parallelismReport( - [](const V3GraphVertex* vtxp) { return vtxp->as()->cost(); }); + const auto report = graph.parallelismReport( + [](const V3GraphVertex* vtxp) { return vtxp->as()->cost(); }); V3Stats::addStat("MTask graph, " + stage + ", critical path cost", report.criticalPathCost()); V3Stats::addStat("MTask graph, " + stage + ", total graph cost", report.totalGraphCost()); V3Stats::addStat("MTask graph, " + stage + ", mtask count", report.vertexCount()); @@ -2217,17 +2125,17 @@ void V3Partition::debugMTaskGraphStats(const V3Graph* graphp, const string& stag // Print a hash of the shape of graphp. If you are battling // nondeterminism, this can help to pinpoint where in the pipeline it's // creeping in. -void V3Partition::hashGraphDebug(const V3Graph* graphp, const char* debugName) { +static void hashGraphDebug(const V3Graph& graph, const char* debugName) { // Disabled when there are no nondeterminism issues in flight. 
if (!v3Global.opt.debugNondeterminism()) return; std::unordered_map vx2Id; unsigned id = 0; - for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { + for (const V3GraphVertex* vxp = graph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { vx2Id[vxp] = id++; } unsigned hash = 0; - for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { + for (const V3GraphVertex* vxp = graph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { for (const V3GraphEdge* edgep = vxp->outBeginp(); edgep; edgep = edgep->outNextp()) { const V3GraphVertex* const top = edgep->top(); hash = vx2Id[top] + 31U * hash; // The K&R hash function @@ -2236,240 +2144,264 @@ void V3Partition::hashGraphDebug(const V3Graph* graphp, const char* debugName) { UINFO(0, "Hash of shape (not contents) of " << debugName << " = " << cvtToStr(hash) << endl); } -// Predicate function to determine what MTaskMoveVertex to bypass when constructing the MTask -// graph. The fine-grained dependency graph of MTaskMoveVertex vertices is a bipartite graph of: -// - 1. MTaskMoveVertex instances containing logic via OrderLogicVertex -// (MTaskMoveVertex::logicp() != nullptr) -// - 2. MTaskMoveVertex instances containing an (OrderVarVertex, domain) pair -// Our goal is to order the logic vertices. The second type of variable/domain vertices only carry -// dependencies and are eventually discarded. In order to reduce the working set size of -// PartContraction, we 'bypass' and not create LogicMTask vertices for the variable vertices, and -// instead add the transitive dependencies directly, but only if adding the transitive edges -// directly does not require more dependency edges than keeping the intermediate vertex. That is, -// we bypass a variable vertex if fanIn * fanOut <= fanIn + fanOut. This can only be true if fanIn -// or fanOut are 1, or if they are both 2. This can cause significant reduction in working set -// size. -static bool bypassOk(MTaskMoveVertex* mvtxp) { - // Need to keep all logic vertices - if (mvtxp->logicp()) return false; - // Count fan-in, up to 3 - unsigned fanIn = 0; - for (V3GraphEdge* edgep = mvtxp->inBeginp(); edgep; edgep = edgep->inNextp()) { - if (++fanIn == 3) break; - } - UDEBUGONLY(UASSERT_OBJ(fanIn <= 3, mvtxp, "Should have stopped counting fanIn");); - // If fanInn no more than one, bypass - if (fanIn <= 1) return true; - // Count fan-out, up to 3 - unsigned fanOut = 0; - for (V3GraphEdge* edgep = mvtxp->outBeginp(); edgep; edgep = edgep->outNextp()) { - if (++fanOut == 3) break; - } - UDEBUGONLY(UASSERT_OBJ(fanOut <= 3, mvtxp, "Should have stopped counting fanOut");); - // If fan-out no more than one, bypass - if (fanOut <= 1) return true; - // They can only be (2, 2), (2, 3), (3, 2), (3, 3) at this point, bypass if (2, 2) - return fanIn + fanOut == 4; -} +//************************************************************************* +// Partitioner takes the fine-grained logic graph from V3Order and +// collapses it into a coarse-grained graph of LogicMTask's, each +// of which contains of set of the logic nodes from the fine-grained +// graph. -uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) { - uint32_t totalGraphCost = 0; +class Partitioner final { + // MEMBERS + const V3Graph& m_fineDepsGraph; // Fine-grained dependency graph + std::unique_ptr m_mTaskGraphp{new V3Graph{}}; // The resulting MTask graph - // Artificial single entry point vertex in the MTask graph to allow sibling merges. 
-    // This is required as otherwise disjoint sub-graphs could not be merged, but the
-    // coarsening algorithm assumes that the graph is connected.
-    m_entryMTaskp = new LogicMTask{mtasksp, nullptr};
+    LogicMTask* m_entryMTaskp = nullptr;  // Singular source vertex of the dependency graph
+    LogicMTask* m_exitMTaskp = nullptr;  // Singular sink vertex of the dependency graph
-    // The V3InstrCount within LogicMTask will set user1 on each AST
-    // node, to assert that we never count any node twice.
-    const VNUser1InUse user1inUse;
+    // METHODS
-    // Create the LogicMTasks for each MTaskMoveVertex
-    for (V3GraphVertex *vtxp = m_fineDepsGraphp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
-        nextp = vtxp->verticesNextp();
-        MTaskMoveVertex* const mVtxp = static_cast<MTaskMoveVertex*>(vtxp);
-        if (bypassOk(mVtxp)) {
-            mVtxp->userp(nullptr);  // Set to nullptr to mark as bypassed
-        } else {
-            LogicMTask* const mtaskp = new LogicMTask{mtasksp, mVtxp};
-            mVtxp->userp(mtaskp);
-            totalGraphCost += mtaskp->cost();
+    // Predicate function to determine what MTaskMoveVertex to bypass when constructing the MTask
+    // graph. The fine-grained dependency graph of MTaskMoveVertex vertices is a bipartite graph
+    // of:
+    // - 1. MTaskMoveVertex instances containing logic via OrderLogicVertex
+    //      (MTaskMoveVertex::logicp() != nullptr)
+    // - 2. MTaskMoveVertex instances containing an (OrderVarVertex, domain) pair
+    // Our goal is to order the logic vertices. The second type of variable/domain vertices only
+    // carry dependencies and are eventually discarded. In order to reduce the working set size of
+    // Contraction, we 'bypass' them and do not create LogicMTask vertices for the variable vertices,
+    // and instead add the transitive dependencies directly, but only if adding the transitive
+    // edges directly does not require more dependency edges than keeping the intermediate vertex.
+    // That is, we bypass a variable vertex if fanIn * fanOut <= fanIn + fanOut. This can only be
+    // true if fanIn or fanOut are 1, or if they are both 2. This can cause significant reduction
+    // in working set size.
+    static bool bypassOk(MTaskMoveVertex* mvtxp) {
+        // Need to keep all logic vertices
+        if (mvtxp->logicp()) return false;
+        // Count fan-in, up to 3
+        unsigned fanIn = 0;
+        for (V3GraphEdge* edgep = mvtxp->inBeginp(); edgep; edgep = edgep->inNextp()) {
+            if (++fanIn == 3) break;
         }
+        UDEBUGONLY(UASSERT_OBJ(fanIn <= 3, mvtxp, "Should have stopped counting fanIn"););
+        // If fan-in no more than one, bypass
+        if (fanIn <= 1) return true;
+        // Count fan-out, up to 3
+        unsigned fanOut = 0;
+        for (V3GraphEdge* edgep = mvtxp->outBeginp(); edgep; edgep = edgep->outNextp()) {
+            if (++fanOut == 3) break;
+        }
+        UDEBUGONLY(UASSERT_OBJ(fanOut <= 3, mvtxp, "Should have stopped counting fanOut"););
+        // If fan-out no more than one, bypass
+        if (fanOut <= 1) return true;
+        // They can only be (2, 2), (2, 3), (3, 2), (3, 3) at this point, bypass if (2, 2)
+        return fanIn + fanOut == 4;
     }
-    // Artificial single exit point vertex in the MTask graph to allow sibling merges.
-    // this enables merging MTasks with no downstream dependents if that is the ideal merge.
-    m_exitMTaskp = new LogicMTask{mtasksp, nullptr};
+    uint32_t setupMTaskDeps() VL_MT_DISABLED {
+        uint32_t totalGraphCost = 0;
-    // Create the mtask->mtask dependency edges based on the dependencies between MTaskMoveVertex
-    // vertices.
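To make the bypass condition above concrete, here is a minimal standalone sketch (not part of the patch; plain C++ with no Verilator types) that enumerates the small (fanIn, fanOut) pairs and checks fanIn * fanOut <= fanIn + fanOut:

// Standalone sketch: which small fan-in/fan-out pairs satisfy the bypass test.
#include <cstdio>

int main() {
    for (unsigned fanIn = 1; fanIn <= 3; ++fanIn) {
        for (unsigned fanOut = 1; fanOut <= 3; ++fanOut) {
            // Bypass when adding fanIn*fanOut transitive edges costs no more
            // than keeping the fanIn+fanOut edges of the intermediate vertex.
            const bool bypass = fanIn * fanOut <= fanIn + fanOut;
            std::printf("fanIn=%u fanOut=%u -> %s\n", fanIn, fanOut,
                        bypass ? "bypass" : "keep");
        }
    }
    // Prints "bypass" exactly for (1, *), (*, 1) and (2, 2), matching the comment above.
    return 0;
}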
- for (V3GraphVertex *vtxp = mtasksp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) { - nextp = vtxp->verticesNextp(); - LogicMTask* const mtaskp = static_cast(vtxp); + // Artificial single entry point vertex in the MTask graph to allow sibling merges. + // This is required as otherwise disjoint sub-graphs could not be merged, but the + // coarsening algorithm assumes that the graph is connected. + m_entryMTaskp = new LogicMTask{m_mTaskGraphp.get(), nullptr}; - // Entry and exit vertices handled separately - if (VL_UNLIKELY((mtaskp == m_entryMTaskp) || (mtaskp == m_exitMTaskp))) continue; + // The V3InstrCount within LogicMTask will set user1 on each AST + // node, to assert that we never count any node twice. + const VNUser1InUse user1inUse; - // At this point, there should only be one MTaskMoveVertex per LogicMTask - UASSERT_OBJ(mtaskp->vertexListp()->size() == 1, mtaskp, "Multiple MTaskMoveVertex"); - MTaskMoveVertex* const mvtxp = mtaskp->vertexListp()->front(); - UASSERT_OBJ(mvtxp->userp(), mtaskp, "Bypassed MTaskMoveVertex should not have MTask"); - - // Function to add a edge to a dependent from 'mtaskp' - const auto addEdge = [mtasksp, mtaskp](LogicMTask* otherp) { - UASSERT_OBJ(otherp != mtaskp, mtaskp, "Would create a cycle edge"); - if (mtaskp->hasRelativeMTask(otherp)) return; // Don't create redundant edges. - new MTaskEdge{mtasksp, mtaskp, otherp, 1}; - }; - - // Iterate downstream direct dependents - for (V3GraphEdge *dEdgep = mvtxp->outBeginp(), *dNextp; dEdgep; dEdgep = dNextp) { - dNextp = dEdgep->outNextp(); - V3GraphVertex* const top = dEdgep->top(); - if (LogicMTask* const otherp = static_cast(top->userp())) { - // The opposite end of the edge is not a bypassed vertex, add as direct dependent - addEdge(otherp); + // Create the LogicMTasks for each MTaskMoveVertex + for (V3GraphVertex *vtxp = m_fineDepsGraph.verticesBeginp(), *nextp; vtxp; vtxp = nextp) { + nextp = vtxp->verticesNextp(); + MTaskMoveVertex* const mVtxp = static_cast(vtxp); + if (bypassOk(mVtxp)) { + mVtxp->userp(nullptr); // Set to nullptr to mark as bypassed } else { - // The opposite end of the edge is a bypassed vertex, add transitive dependents - for (V3GraphEdge *tEdgep = top->outBeginp(), *tNextp; tEdgep; tEdgep = tNextp) { - tNextp = tEdgep->outNextp(); - LogicMTask* const transp = static_cast(tEdgep->top()->userp()); - // The Move graph is bipartite (logic <-> var), and logic is never bypassed, - // hence 'transp' must be non nullptr. - UASSERT_OBJ(transp, mvtxp, "This cannot be a bypassed vertex"); - addEdge(transp); + LogicMTask* const mtaskp = new LogicMTask{m_mTaskGraphp.get(), mVtxp}; + mVtxp->userp(mtaskp); + totalGraphCost += mtaskp->cost(); + } + } + + // Artificial single exit point vertex in the MTask graph to allow sibling merges. + // this enables merging MTasks with no downstream dependents if that is the ideal merge. + m_exitMTaskp = new LogicMTask{m_mTaskGraphp.get(), nullptr}; + + // Create the mtask->mtask dependency edges based on the dependencies between + // MTaskMoveVertex vertices. 
+        for (V3GraphVertex *vtxp = m_mTaskGraphp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
+            nextp = vtxp->verticesNextp();
+            LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp);
+
+            // Entry and exit vertices handled separately
+            if (VL_UNLIKELY((mtaskp == m_entryMTaskp) || (mtaskp == m_exitMTaskp))) continue;
+
+            // At this point, there should only be one MTaskMoveVertex per LogicMTask
+            UASSERT_OBJ(mtaskp->vertexList().size() == 1, mtaskp, "Multiple MTaskMoveVertex");
+            MTaskMoveVertex* const mvtxp = mtaskp->vertexList().front();
+            UASSERT_OBJ(mvtxp->userp(), mtaskp, "Bypassed MTaskMoveVertex should not have MTask");
+
+            // Function to add an edge to a dependent from 'mtaskp'
+            const auto addEdge = [this, mtaskp](LogicMTask* otherp) {
+                UASSERT_OBJ(otherp != mtaskp, mtaskp, "Would create a cycle edge");
+                if (mtaskp->hasRelativeMTask(otherp)) return;  // Don't create redundant edges.
+                new MTaskEdge{m_mTaskGraphp.get(), mtaskp, otherp, 1};
+            };
+
+            // Iterate downstream direct dependents
+            for (V3GraphEdge *dEdgep = mvtxp->outBeginp(), *dNextp; dEdgep; dEdgep = dNextp) {
+                dNextp = dEdgep->outNextp();
+                V3GraphVertex* const top = dEdgep->top();
+                if (LogicMTask* const otherp = static_cast<LogicMTask*>(top->userp())) {
+                    // The opposite end of the edge is not a bypassed vertex, add as direct
+                    // dependent
+                    addEdge(otherp);
+                } else {
+                    // The opposite end of the edge is a bypassed vertex, add transitive dependents
+                    for (V3GraphEdge *tEdgep = top->outBeginp(), *tNextp; tEdgep;
+                         tEdgep = tNextp) {
+                        tNextp = tEdgep->outNextp();
+                        LogicMTask* const transp
+                            = static_cast<LogicMTask*>(tEdgep->top()->userp());
+                        // The Move graph is bipartite (logic <-> var), and logic is never
+                        // bypassed, hence 'transp' must not be nullptr.
+                        UASSERT_OBJ(transp, mvtxp, "This cannot be a bypassed vertex");
+                        addEdge(transp);
+                    }
+                }
+            }
+        }
-    // Create Dependencies to/from the entry/exit vertices.
-    for (V3GraphVertex *vtxp = mtasksp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
-        nextp = vtxp->verticesNextp();
-        LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp);
+        // Create Dependencies to/from the entry/exit vertices.
+        for (V3GraphVertex *vtxp = m_mTaskGraphp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
+            nextp = vtxp->verticesNextp();
+            LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp);
-        if (VL_UNLIKELY((mtaskp == m_entryMTaskp) || (mtaskp == m_exitMTaskp))) continue;
+            if (VL_UNLIKELY((mtaskp == m_entryMTaskp) || (mtaskp == m_exitMTaskp))) continue;
-        // Add the entry/exit edges
-        if (mtaskp->inEmpty()) new MTaskEdge{mtasksp, m_entryMTaskp, mtaskp, 1};
-        if (mtaskp->outEmpty()) new MTaskEdge{mtasksp, mtaskp, m_exitMTaskp, 1};
-    }
-
-    return totalGraphCost;
-}
-
-void V3Partition::go(V3Graph* mtasksp) {
-    // Called by V3Order
-    hashGraphDebug(m_fineDepsGraphp, "v3partition initial fine-grained deps");
-
-    // Create the first MTasks. Initially, each MTask just wraps one
-    // MTaskMoveVertex. Over time, we'll merge MTasks together and
-    // eventually each MTask will wrap a large number of MTaskMoveVertices
-    // (and the logic nodes therein.)
-    const uint32_t totalGraphCost = setupMTaskDeps(mtasksp);
-
-    V3Partition::debugMTaskGraphStats(mtasksp, "initial");
-
-    // For debug: print out the longest critical path. This allows us to
-    // verify that the costs look reasonable, that we aren't combining
-    // nodes that should probably be split, etc.
-    if (dumpLevel() >= 3) LogicMTask::dumpCpFilePrefixed(mtasksp, "cp");
-
-    // Merge nodes that could present data hazards; see comment within.
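For illustration only, a standalone sketch (not part of the patch; the tiny graph and node numbering are invented) of the transitive-edge step performed for bypassed variable vertices in the loop above: predecessors of the bypassed node are wired directly to its successors, so the node can be dropped without losing any ordering constraint.

// Standalone sketch: bypassing node 1 of a toy graph while keeping its dependencies.
#include <cstdio>
#include <set>
#include <utility>
#include <vector>

int main() {
    // Edges of a tiny fine-grained graph: 0, 2 and 3 stand for logic, 1 for a variable.
    const std::vector<std::pair<int, int>> edges{{0, 1}, {1, 2}, {1, 3}};
    const int bypassed = 1;

    std::set<std::pair<int, int>> coarse;  // Edges of the coarsened graph
    for (const auto& e : edges) {
        if (e.first != bypassed && e.second != bypassed) coarse.insert(e);
    }
    // Add predecessor -> successor edges across the bypassed node.
    for (const auto& in : edges) {
        if (in.second != bypassed) continue;
        for (const auto& out : edges) {
            if (out.first != bypassed) continue;
            coarse.insert({in.first, out.second});  // std::set drops duplicates
        }
    }
    for (const auto& e : coarse) std::printf("%d -> %d\n", e.first, e.second);
    // Prints "0 -> 2" and "0 -> 3": the dependencies survive, node 1 does not.
    return 0;
}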
- { - PartFixDataHazards{m_orderGraphp, mtasksp}.go(); - V3Partition::debugMTaskGraphStats(mtasksp, "hazards"); - hashGraphDebug(mtasksp, "mtasksp after fixDataHazards()"); - } - - // Setup the critical path into and out of each node. - partInitCriticalPaths(mtasksp); - hashGraphDebug(mtasksp, "after partInitCriticalPaths()"); - - // Order the graph. We know it's already ranked from fixDataHazards() - // so we don't need to rank it again. - // - // On at least some models, ordering the graph here seems to help - // performance. (Why? Is it just triggering noise in a lucky direction? - // Is it just as likely to harm results?) - // - // More diversity of models that can build with --threads will - // eventually tell us. For now keep the order() so we don't forget - // about it, in case it actually helps. TODO: get more data and maybe - // remove this later if it doesn't really help. - mtasksp->orderPreRanked(); - - const int targetParFactor = v3Global.opt.threads(); - UASSERT(targetParFactor >= 2, "Should not reach V3Partition when --threads <= 1"); - - // Set cpLimit to roughly totalGraphCost / nThreads - // - // Actually set it a bit lower, by a hardcoded fudge factor. This - // results in more smaller mtasks, which helps reduce fragmentation - // when scheduling them. - const unsigned fudgeNumerator = 3; - const unsigned fudgeDenominator = 5; - const uint32_t cpLimit - = ((totalGraphCost * fudgeNumerator) / (targetParFactor * fudgeDenominator)); - UINFO(4, "V3Partition set cpLimit = " << cpLimit << endl); - - // Merge MTask nodes together, repeatedly, until the CP budget is - // reached. Coarsens the graph, usually by several orders of - // magnitude. - // - // Some tests disable this, hence the test on threadsCoarsen(). - // Coarsening is always enabled in production. - if (v3Global.opt.threadsCoarsen()) { - PartContraction{mtasksp, cpLimit, m_entryMTaskp, m_exitMTaskp, - // --debugPartition is used by tests - // to enable slow assertions. - v3Global.opt.debugPartition()} - .go(); - V3Partition::debugMTaskGraphStats(mtasksp, "contraction"); - } - { - mtasksp->removeTransitiveEdges(); - V3Partition::debugMTaskGraphStats(mtasksp, "transitive1"); - } - - // Reassign MTask IDs onto smaller numbers, which should be more stable - // across small logic changes. Keep MTask IDs in the same relative - // order though, otherwise we break CmpLogicMTask for still-existing - // EdgeSet's that haven't destructed yet. - { - using SortedMTaskSet = std::set; - SortedMTaskSet sorted; - for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) { - LogicMTask* const mtaskp = static_cast(itp); - sorted.insert(mtaskp); + // Add the entry/exit edges + if (mtaskp->inEmpty()) new MTaskEdge{m_mTaskGraphp.get(), m_entryMTaskp, mtaskp, 1}; + if (mtaskp->outEmpty()) new MTaskEdge{m_mTaskGraphp.get(), mtaskp, m_exitMTaskp, 1}; } - for (auto it = sorted.begin(); it != sorted.end(); ++it) { - // We shouldn't perturb the sort order of the set, despite - // changing the IDs, they should all just remain in the same - // relative order. 
Confirm that:
-            const uint32_t nextId = v3Global.rootp()->allocNextMTaskID();
-            UASSERT(nextId <= (*it)->id(), "Should only shrink MTaskIDs here");
-            UINFO(4, "Reassigning MTask id " << (*it)->id() << " to id " << nextId << "\n");
-            (*it)->id(nextId);
+
+        return totalGraphCost;
+    }
+
+    // CONSTRUCTORS
+    Partitioner(const OrderGraph& orderGraph, const V3Graph& fineDepsGraph)
+        : m_fineDepsGraph{fineDepsGraph} {
+        // Fill in m_mTaskGraphp with LogicMTask's and their interdependencies.
+
+        // Called by V3Order
+        hashGraphDebug(m_fineDepsGraph, "v3partition initial fine-grained deps");
+
+        // Create the first MTasks. Initially, each MTask just wraps one
+        // MTaskMoveVertex. Over time, we'll merge MTasks together and
+        // eventually each MTask will wrap a large number of MTaskMoveVertices
+        // (and the logic nodes therein.)
+        const uint32_t totalGraphCost = setupMTaskDeps();
+
+        debugMTaskGraphStats(*m_mTaskGraphp, "initial");
+
+        // For debug: print out the longest critical path. This allows us to
+        // verify that the costs look reasonable, that we aren't combining
+        // nodes that should probably be split, etc.
+        if (dumpLevel() >= 3) LogicMTask::dumpCpFilePrefixed(*m_mTaskGraphp, "cp");
+
+        // Merge nodes that could present data hazards; see comment within.
+        FixDataHazards::apply(orderGraph, *m_mTaskGraphp);
+        debugMTaskGraphStats(*m_mTaskGraphp, "hazards");
+        hashGraphDebug(*m_mTaskGraphp, "m_mTaskGraphp after fixDataHazards()");
+
+        // Setup the critical path into and out of each node.
+        partInitCriticalPaths(*m_mTaskGraphp);
+        hashGraphDebug(*m_mTaskGraphp, "after partInitCriticalPaths()");
+
+        // Order the graph. We know it's already ranked from fixDataHazards()
+        // so we don't need to rank it again.
+        //
+        // On at least some models, ordering the graph here seems to help
+        // performance. (Why? Is it just triggering noise in a lucky direction?
+        // Is it just as likely to harm results?)
+        //
+        // More diversity of models that can build with --threads will
+        // eventually tell us. For now keep the order() so we don't forget
+        // about it, in case it actually helps. TODO: get more data and maybe
+        // remove this later if it doesn't really help.
+        m_mTaskGraphp->orderPreRanked();
+
+        // Merge MTask nodes together, repeatedly, until the CP budget is
+        // reached. Coarsens the graph, usually by several orders of
+        // magnitude.
+        //
+        // Some tests disable this, hence the test on threadsCoarsen().
+        // Coarsening is always enabled in production.
+        if (v3Global.opt.threadsCoarsen()) {
+            const int targetParFactor = v3Global.opt.threads();
+            UASSERT(targetParFactor >= 2, "Should not reach Partitioner when --threads <= 1");
+
+            // Set cpLimit to roughly totalGraphCost / nThreads
+            //
+            // Actually set it a bit lower, by a hardcoded fudge factor. This
+            // results in more, smaller MTasks, which helps reduce fragmentation
+            // when scheduling them.
+            const unsigned fudgeNumerator = 3;
+            const unsigned fudgeDenominator = 5;
+            const uint32_t cpLimit
+                = ((totalGraphCost * fudgeNumerator) / (targetParFactor * fudgeDenominator));
+            UINFO(4, "Partitioner set cpLimit = " << cpLimit << endl);
+
+            Contraction::apply(*m_mTaskGraphp, cpLimit, m_entryMTaskp, m_exitMTaskp,
+                               // --debugPartition is used by tests
+                               // to enable slow assertions.
+ v3Global.opt.debugPartition()); + debugMTaskGraphStats(*m_mTaskGraphp, "contraction"); + } + + m_mTaskGraphp->removeTransitiveEdges(); + debugMTaskGraphStats(*m_mTaskGraphp, "transitive1"); + + // Reassign MTask IDs onto smaller numbers, which should be more stable + // across small logic changes. Keep MTask IDs in the same relative + // order though, otherwise we break CmpLogicMTask for still-existing + // EdgeSet's that haven't destructed yet. + { + using SortedMTaskSet = std::set; + SortedMTaskSet sorted; + for (V3GraphVertex* itp = m_mTaskGraphp->verticesBeginp(); itp; + itp = itp->verticesNextp()) { + LogicMTask* const mtaskp = static_cast(itp); + sorted.insert(mtaskp); + } + for (auto it = sorted.begin(); it != sorted.end(); ++it) { + // We shouldn't perturb the sort order of the set, despite + // changing the IDs, they should all just remain in the same + // relative order. Confirm that: + const uint32_t nextId = v3Global.rootp()->allocNextMTaskID(); + UASSERT(nextId <= (*it)->id(), "Should only shrink MTaskIDs here"); + UINFO(4, "Reassigning MTask id " << (*it)->id() << " to id " << nextId << "\n"); + (*it)->id(nextId); + } + } + + // Set color to indicate an mtaskId on every underlying MTaskMoveVertex. + for (V3GraphVertex* itp = m_mTaskGraphp->verticesBeginp(); itp; + itp = itp->verticesNextp()) { + const LogicMTask* const mtaskp = static_cast(itp); + for (MTaskMoveVertex* const mvertexp : mtaskp->vertexList()) { + mvertexp->color(mtaskp->id()); + } } } + ~Partitioner() = default; + VL_UNCOPYABLE(Partitioner); + VL_UNMOVABLE(Partitioner); - // Set color to indicate an mtaskId on every underlying MTaskMoveVertex. - for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) { - const LogicMTask* const mtaskp = static_cast(itp); - for (LogicMTask::VxList::const_iterator it = mtaskp->vertexListp()->begin(); - it != mtaskp->vertexListp()->end(); ++it) { - MTaskMoveVertex* const mvertexp = *it; - mvertexp->color(mtaskp->id()); - } +public: + static std::unique_ptr apply(const OrderGraph& orderGraph, + const V3Graph& fineDepsGraph) { + return std::move(Partitioner{orderGraph, fineDepsGraph}.m_mTaskGraphp); } -} - -void V3Order::selfTestParallel() { - UINFO(2, __FUNCTION__ << ": " << endl); - PartPropagateCpSelfTest::selfTest(); - PartContraction::selfTest(); -} +}; // Sort MTaskMoveVertex vertices by domain, then by scope, based on teh order they are encountered class OrderVerticesByDomainThenScope final { @@ -2494,28 +2426,25 @@ public: } }; -// Sort AbstractMTask vertices by their serial IDs. +// Sort LogicMTask vertices by their serial IDs. struct MTaskVxIdLessThan final { bool operator()(const V3GraphVertex* lhsp, const V3GraphVertex* rhsp) const { - return lhsp->as()->id() < rhsp->as()->id(); + return lhsp->as()->id() < rhsp->as()->id(); } }; -AstExecGraph* V3Order::createParallel(const OrderGraph& graph, const std::string& tag, +AstExecGraph* V3Order::createParallel(const OrderGraph& orderGraph, const std::string& tag, const TrigToSenMap& trigToSen, bool slow) { UINFO(2, " Constructing parallel code for '" + tag + "'"); // For nondeterminism debug: - V3Partition::hashGraphDebug(&graph, "V3Order's m_graph"); + hashGraphDebug(orderGraph, "V3OrderParallel's input OrderGraph"); - // We already produced a graph of every var, input, and logic - // block and all dependencies; this is 'm_graph'. 
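As a rough illustration of the cpLimit fudge factor computed in the Partitioner constructor above, here is a standalone sketch with invented numbers (not part of the patch): the 3/5 factor sets the per-MTask critical-path budget below the "ideal" totalGraphCost / nThreads, trading slightly smaller MTasks for less scheduling fragmentation.

// Standalone sketch: cpLimit versus the naive per-thread budget, with made-up costs.
#include <cstdio>

int main() {
    const unsigned totalGraphCost = 100000;  // Invented example cost
    const unsigned targetParFactor = 4;      // e.g. --threads 4
    const unsigned fudgeNumerator = 3;
    const unsigned fudgeDenominator = 5;
    const unsigned naive = totalGraphCost / targetParFactor;
    const unsigned cpLimit
        = (totalGraphCost * fudgeNumerator) / (targetParFactor * fudgeDenominator);
    std::printf("naive=%u cpLimit=%u\n", naive, cpLimit);  // naive=25000 cpLimit=15000
    return 0;
}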
-    //
-    // Now, starting from m_graph, make a slightly-coarsened graph representing
+    // Starting from the orderGraph, make a slightly-coarsened graph representing
     // only logic, and discarding edges we know we can ignore.
     // This is quite similar to the 'm_pomGraph' of the serial code gen:
     const std::unique_ptr logicGraphp
-        = V3OrderMoveGraphBuilder::apply(graph, trigToSen);
+        = V3OrderMoveGraphBuilder::apply(orderGraph, trigToSen);
 
     // Needed? We do this for m_pomGraph in serial mode, so do it here too:
     logicGraphp->removeRedundantEdgesMax(&V3GraphEdge::followAlwaysTrue);
@@ -2523,19 +2452,15 @@ AstExecGraph* V3Order::createParallel(const OrderGraph& graph, const std::string
     // Partition logicGraph into LogicMTask's. The partitioner will annotate
     // each vertex in logicGraph with a 'color' which is really an mtask ID
     // in this context.
-    V3Partition partitioner{&graph, logicGraphp.get()};
-    V3Graph mtasks;
-    partitioner.go(&mtasks);
+    const std::unique_ptr<V3Graph> mTaskGraphp = Partitioner::apply(orderGraph, *logicGraphp);
 
-    // processMTask* routines schedule threaded execution
     struct MTaskState final {
         AstMTaskBody* m_mtaskBodyp = nullptr;
-        std::list m_logics;
+        std::vector m_logics;
         ExecMTask* m_execMTaskp = nullptr;
-        MTaskState() = default;
     };
-    std::unordered_map<uint32_t, MTaskState> mtaskStates;
+    std::unordered_map<uint32_t, MTaskState> mtaskStates;
 
     // Iterate through the entire logicGraph. For each logic node,
     // attach it to a per-MTask ordered list of logic nodes.
@@ -2556,37 +2481,32 @@ AstExecGraph* V3Order::createParallel(const OrderGraph& graph, const std::string
 
         // Since we happen to be iterating over every logic node,
         // take this opportunity to annotate each AstVar with the id's
-        // of mtasks that consume it and produce it. We'll use this
+        // of MTasks that consume it and produce it. We'll use this
        // information in V3EmitC when we lay out var's in memory.
        const OrderLogicVertex* const logicp = movep->logicp();
        for (const V3GraphEdge* edgep = logicp->inBeginp(); edgep; edgep = edgep->inNextp()) {
-            const OrderVarVertex* const pre_varp = edgep->fromp()->cast<const OrderVarVertex>();
-            if (!pre_varp) continue;
-            AstVar* const varp = pre_varp->vscp()->varp();
-            // varp depends on logicp, so logicp produces varp,
-            // and vice-versa below
-            varp->addProducingMTaskId(mtaskId);
+            const OrderVarVertex* const vVtxp = edgep->fromp()->cast<const OrderVarVertex>();
+            if (!vVtxp) continue;
+            vVtxp->vscp()->varp()->addMTaskId(mtaskId);
        }
        for (const V3GraphEdge* edgep = logicp->outBeginp(); edgep; edgep = edgep->outNextp()) {
-            const OrderVarVertex* const post_varp = edgep->top()->cast<const OrderVarVertex>();
-            if (!post_varp) continue;
-            AstVar* const varp = post_varp->vscp()->varp();
-            varp->addConsumingMTaskId(mtaskId);
+            const OrderVarVertex* const vVtxp = edgep->top()->cast<const OrderVarVertex>();
+            if (!vVtxp) continue;
+            vVtxp->vscp()->varp()->addMTaskId(mtaskId);
        }
-        // TODO? We ignore IO vars here, so those will have empty mtask
-        // signatures. But we could also give those mtask signatures.
    }
 
    // Create the AstExecGraph node which represents the execution
    // of the MTask graph.
    FileLine* const rootFlp = v3Global.rootp()->fileline();
    AstExecGraph* const execGraphp = new AstExecGraph{rootFlp, tag};
+    V3Graph* const depGraphp = execGraphp->depGraphp();
 
    // Create CFuncs and bodies for each MTask.
    V3OrderCFuncEmitter emitter{tag, slow};
-    GraphStream<MTaskVxIdLessThan> mtaskStream{&mtasks};
+    GraphStream<MTaskVxIdLessThan> mtaskStream{mTaskGraphp.get()};
    while (const V3GraphVertex* const vtxp = mtaskStream.nextp()) {
-        const AbstractLogicMTask* const mtaskp = vtxp->as<AbstractLogicMTask>();
+        const LogicMTask* const mtaskp = vtxp->as<LogicMTask>();
 
        // Create a body for this mtask
        AstMTaskBody* const bodyp = new AstMTaskBody{rootFlp};
@@ -2604,7 +2524,6 @@ AstExecGraph* V3Order::createParallel(const OrderGraph& graph, const std::string
        // and OrderLogicVertex's which are ephemeral to V3Order.
        // - The ExecMTask graph and the AstMTaskBody's produced here
        //   persist until code generation time.
-        V3Graph* const depGraphp = execGraphp->depGraphp();
        state.m_execMTaskp = new ExecMTask{depGraphp, bodyp, mtaskp->id()};
        // Cross-link each ExecMTask and MTaskBody
        //  Q: Why even have two objects?
@@ -2613,8 +2532,7 @@ AstExecGraph* V3Order::createParallel(const OrderGraph& graph, const std::string
        state.m_mtaskBodyp->execMTaskp(state.m_execMTaskp);
        for (V3GraphEdge* inp = mtaskp->inBeginp(); inp; inp = inp->inNextp()) {
            const V3GraphVertex* fromVxp = inp->fromp();
-            const AbstractLogicMTask* const fromp
-                = static_cast<const AbstractLogicMTask*>(fromVxp);
+            const LogicMTask* const fromp = fromVxp->as<LogicMTask>();
            const MTaskState& fromState = mtaskStates[fromp->id()];
            new V3GraphEdge{depGraphp, fromState.m_execMTaskp, state.m_execMTaskp, 1};
        }
@@ -2623,3 +2541,10 @@ AstExecGraph* V3Order::createParallel(const OrderGraph& graph, const std::string
 
    return execGraphp;
 }
+
+void V3Order::selfTestParallel() {
+    UINFO(2, __FUNCTION__ << ": " << endl);
+    PropagateCp<GraphWay::FORWARD>::selfTest();
+    PropagateCp<GraphWay::REVERSE>::selfTest();
+    Contraction::selfTest();
+}
diff --git a/src/V3PartitionGraph.h b/src/V3PartitionGraph.h
index 04eed0136..251e07cc6 100644
--- a/src/V3PartitionGraph.h
+++ b/src/V3PartitionGraph.h
@@ -25,37 +25,11 @@
 
 #include 
 
-class MTaskMoveVertex;
-
 //*************************************************************************
 // MTasks and graph structures
 
-class AbstractMTask VL_NOT_FINAL : public V3GraphVertex {
-    VL_RTTI_IMPL(AbstractMTask, V3GraphVertex)
-public:
-    explicit AbstractMTask(V3Graph* graphp) VL_MT_DISABLED : V3GraphVertex{graphp} {}
-    ~AbstractMTask() override = default;
-    virtual uint32_t id() const = 0;
-    virtual uint32_t cost() const = 0;
-};
-
-class AbstractLogicMTask VL_NOT_FINAL : public AbstractMTask {
-    VL_RTTI_IMPL(AbstractLogicMTask, AbstractMTask)
-public:
-    // TYPES
-    using VxList = std::list<MTaskMoveVertex*>;
-    // CONSTRUCTORS
-    explicit AbstractLogicMTask(V3Graph* graphp) VL_MT_DISABLED : AbstractMTask{graphp} {}
-    ~AbstractLogicMTask() override = default;
-    // METHODS
-    // Set of logic vertices in this mtask. Order is not significant.
-    virtual const VxList* vertexListp() const = 0;
-    uint32_t id() const override = 0;  // Unique id of this mtask.
-    uint32_t cost() const override = 0;
-};
-
-class ExecMTask final : public AbstractMTask {
-    VL_RTTI_IMPL(ExecMTask, AbstractMTask)
+class ExecMTask final : public V3GraphVertex {
+    VL_RTTI_IMPL(ExecMTask, V3GraphVertex)
 private:
    AstMTaskBody* const m_bodyp;  // Task body
    const uint32_t m_id;  // Unique id of this mtask.
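As a rough illustration of why streaming the MTask graph with an ID-based tie-break (the GraphStream loop with MTaskVxIdLessThan above) yields a deterministic emission order, here is a standalone sketch on an invented four-node DAG (not part of the patch; the association with GraphStream's internals is an assumption, only the general ready-set/tie-break idea is shown):

// Standalone sketch: topological emission where ready vertices are taken smallest-ID first.
#include <cstdio>
#include <functional>
#include <map>
#include <queue>
#include <vector>

int main() {
    // id -> dependent ids of a small invented DAG: 1 and 2 feed 3, which feeds 4.
    const std::map<int, std::vector<int>> deps{{1, {3}}, {2, {3}}, {3, {4}}, {4, {}}};
    std::map<int, int> nIn;  // Remaining in-degree per vertex
    for (const auto& p : deps) {
        nIn[p.first];  // Ensure every vertex has an entry
        for (const int to : p.second) ++nIn[to];
    }
    // Min-heap keyed on vertex id: the ID-based tie-break
    std::priority_queue<int, std::vector<int>, std::greater<int>> ready;
    for (const auto& p : nIn) {
        if (p.second == 0) ready.push(p.first);
    }
    while (!ready.empty()) {
        const int v = ready.top();
        ready.pop();
        std::printf("emit mtask %d\n", v);  // Emits 1, 2, 3, 4 for this graph, every time
        for (const int to : deps.at(v)) {
            if (--nIn[to] == 0) ready.push(to);
        }
    }
    return 0;
}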
@@ -71,23 +45,19 @@ private: public: ExecMTask(V3Graph* graphp, AstMTaskBody* bodyp, uint32_t id) VL_MT_DISABLED - : AbstractMTask{graphp}, + : V3GraphVertex{graphp}, m_bodyp{bodyp}, m_id{id} {} AstMTaskBody* bodyp() const { return m_bodyp; } - uint32_t id() const override VL_MT_SAFE { return m_id; } + uint32_t id() const VL_MT_SAFE { return m_id; } uint32_t priority() const { return m_priority; } void priority(uint32_t pri) { m_priority = pri; } - uint32_t cost() const override { return m_cost; } + uint32_t cost() const { return m_cost; } void cost(uint32_t cost) { m_cost = cost; } void predictStart(uint64_t time) { m_predictStart = time; } uint64_t predictStart() const { return m_predictStart; } void profilerId(uint64_t id) { m_profilerId = id; } uint64_t profilerId() const { return m_profilerId; } - string cFuncName() const { - // If this MTask maps to a C function, this should be the name - return "__Vmtask"s + "__" + cvtToStr(m_id); - } string name() const override VL_MT_STABLE { return "mt"s + cvtToStr(id()); } string hashName() const { return m_hashName; } void hashName(const string& name) { m_hashName = name; } @@ -96,6 +66,7 @@ public: if (priority() || cost()) str << " [pr=" << priority() << " c=" << cvtToStr(cost()) << "]"; } }; + inline std::ostream& operator<<(std::ostream& os, const ExecMTask& rhs) { rhs.dump(os); return os;