// -*- mode: C++; c-file-style: "cc-mode" -*- //************************************************************************* // DESCRIPTION: Verilator: Threading's logic to mtask partitioner // // Code available from: http://www.veripool.org/verilator // //************************************************************************* // // Copyright 2003-2018 by Wilson Snyder. This program is free software; you can // redistribute it and/or modify it under the terms of either the GNU // Lesser General Public License Version 3 or the Perl Artistic License // Version 2.0. // // Verilator is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // //************************************************************************* #include "config_build.h" #include "verilatedos.h" #include #include #include #include "V3Os.h" #include "V3File.h" #include "V3GraphAlg.h" #include "V3GraphPathChecker.h" #include "V3GraphStream.h" #include "V3InstrCount.h" #include "V3Partition.h" #include "V3PartitionGraph.h" #include "V3Scoreboard.h" #include "V3Stats.h" #include VL_INCLUDE_UNORDERED_SET class MergeCandidate; //###################################################################### // Partitioner tunable settings: // // Before describing these settings, a bit of background: // // Early during the development of the partitioner, V3Split was failing to // split large always blocks (with ~100K assignments) so we had to handle // very large vertices with ~100K incoming and outgoing edges. // // The partitioner attempts to deal with such densely connected // graphs. Some of the tuning parameters below reference "huge vertices", // that's what they're talking about, vertices with tens of thousands of // edges in and out. Whereas most graphs have only tens of edges in and out // of most vertices. // // V3Split has since been fixed to more reliably split large always // blocks. It's kind of an open question whether the partitioner must // handle huge nodes gracefully. Maybe not! But it still can, given // appropriate tuning. // PART_SIBLING_EDGE_LIMIT (integer) // // Arbitrarily limit the number of edges on a single vertex that will be // considered when enumerating siblings, to the given value. This protects // the partitioner runtime in the presence of huge vertices. // // The sibling-merge is less important than the edge merge. (You can // totally disable the sibling merge and get halfway decent partitions; you // can't disable edge merges, those are fundamental to the process.) So, // skipping the enumeration of some siblings on a few vertices does not // have a large impact on the result of the partitioner. // // If your vertices are small, the limit (at 25) approaches a no-op. Hence // there's basically no cost to applying this limit even when we don't // expect huge vertices. // // If you don't care about partitioner runtime and you want the most // aggressive partition, set the limit very high. If you have huge // vertices, leave this as is. #define PART_SIBLING_EDGE_LIMIT 25 // PART_STEPPED_COST (boolean) // // When computing critical path costs, use a step function on the actual // underlying vertex cost. // // If there are huge vertices, when a tiny vertex merges into a huge // vertex, we can often avoid increasing the huge vertex's stepped cost. 
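// For example (illustrative numbers, using LogicMTask::stepCost() defined
// below, which rounds a cost up to roughly the next 5% step):
//
//      stepCost(100) == 104    // rounded up to the next ~5% boundary
//      stepCost(101) == 104    // raw cost grew, stepped cost did not
//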
// If the stepped cost hasn't increased, and the critical path into the huge
// vertex hasn't increased, we can avoid propagating a new critical path to
// vertices past the huge vertex. Since huge vertices tend to have huge lists
// of children and parents, this can be a substantial savings.
//
// Does not seem to reduce the quality of the partitioner's output.
//
// If you have huge vertices, leave this 'true'; it is the major setting
// that allows the partitioner to handle such difficult graphs on anything
// like a human time scale.
//
// If you don't have huge vertices, the 'true' value doesn't help much but
// should cost almost nothing in terms of partitioner quality.
//
// If you want the most aggressive possible partition, set it 'false' and
// be prepared to be disappointed when the improvement in the partition is
// negligible / in the noise.
//
// Q) Why retain the control, if there is really no downside?
//
// A) Cost stepping can lead to corner cases. A developer may wish to
//    disable cost stepping to rule it out as the cause of unexpected
//    behavior.
#define PART_STEPPED_COST true

// PART_STEPPED_RESCORE_LIMIT (boolean)
//
// If false, we always try to merge the absolute lowest (best) scoring
// mtask pair among all candidates.
//
// If true, we're willing to merge mtask pairs with scores up to 5% higher
// (worse) than the best, in exchange for doing a Rescore() operation
// somewhat less often.
//
// A true setting can result in a much faster compile in the presence of
// huge vertices, eg. 45 minutes versus 4.5 minutes for one particular
// model. HOWEVER, a true setting usually results in modestly worse
// partitions, often around 10% more MTasks and 10% longer cycle times.
//
// (TODO: Why does this setting save time with huge vertices?
//  Is there a way to get the best of both worlds without the trade-off?)
//
// If you have huge vertices, you may wish to set this true. If you don't
// have huge vertices (which should be everyone, we think, now that V3Split
// is fixed) leave it set false for the most aggressive partition.
#define PART_STEPPED_RESCORE_LIMIT false

// Don't produce more than a certain maximum number of MTasks. This helps
// the TSP variable sort not to blow up (a concern for some of the tests)
// and we probably don't want a huge number of mtasks in practice anyway
// (50 to 100 is typical.)
//
// If the user doesn't give one with '--threads-max-mtasks', we'll set the
// maximum # of MTasks to
// (# of threads * PART_DEFAULT_MAX_MTASKS_PER_THREAD)
#define PART_DEFAULT_MAX_MTASKS_PER_THREAD 50

// end tunables.

//######################################################################
// Misc graph and assertion utilities

static void partCheckCachedScoreVsActual(uint32_t cached, uint32_t actual) {
#if PART_STEPPED_COST
    // Cached CP might be a little bigger than actual, due to stepped CPs.
    // Example:
    //  Let's say we have a parent with stepped_cost 40 and a grandparent
    //  with stepped_cost 27. Our forward-cp is 67. Then our parent and
    //  grandparent get merged; the merged node has stepped cost 66. We
    //  won't propagate that new CP to children as it hasn't grown. So,
    //  children may continue to think that the CP coming through this path
    //  is a little higher than it really is; permit that.
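    // In that example, a child may still be caching 27 + 40 = 67 while the
    // actual value through the merged node is now 66. The check below
    // therefore allows roughly a 10% mismatch in either direction
    // (67*10 = 670 <= 66*11 = 726) instead of demanding exact equality.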
    UASSERT((((cached * 10) <= (actual * 11))
             && ((cached * 11) >= (actual * 10))),
            "Calculation error in scoring (approximate, may need tweak)");
#else
    UASSERT(cached == actual, "Calculation error in scoring");
#endif
}

//######################################################################
// PartPropagateCp
//
// Propagate increasing critical path (CP) costs through a graph.
//
// Usage:
//  * Client increases the cost and/or CP at a node or small set of nodes
//    (often a pair in practice, eg. edge contraction.)
//  * Client instances a PartPropagateCp object
//  * Client calls PartPropagateCp::cpHasIncreased() one or more times.
//    Each call indicates that the inclusive CP of some "seed" vertex
//    has increased to a given value.
//    * NOTE: PartPropagateCp will neither read nor modify the cost
//      or CPs at the seed vertices, it only accesses and modifies
//      vertices wayward from the seeds.
//  * Client calls PartPropagateCp::go(). Internally, this iteratively
//    propagates the new CPs wayward through the graph.
//
template <class T_CostAccessor>
class PartPropagateCp : GraphAlg<> {
private:
    // MEMBERS
    GraphWay m_way;  // CPs oriented in this direction: either FORWARD
    //               // from graph-start to current node, or REVERSE
    //               // from graph-end to current node.
    T_CostAccessor* m_accessp;  // Access cost and CPs on V3GraphVertex's.
    vluint64_t m_generation;  // Mark each vertex with this number;
    //                        // confirm we only process each vertex once.
    bool m_slowAsserts;  // Enable nontrivial asserts
    typedef SortByValueMap<V3GraphVertex*, uint32_t> PropCpPendSet;
    PropCpPendSet m_pending;  // Pending rescores

public:
    // CONSTRUCTORS
    PartPropagateCp(V3Graph* graphp, GraphWay way, T_CostAccessor* accessp,
                    bool slowAsserts,
                    V3EdgeFuncP edgeFuncp = &V3GraphEdge::followAlwaysTrue)
        : GraphAlg<>(graphp, edgeFuncp)
        , m_way(way)
        , m_accessp(accessp)
        , m_generation(0)
        , m_slowAsserts(slowAsserts) {}

    // METHODS
    void cpHasIncreased(V3GraphVertex* vxp, uint32_t newInclusiveCp) {
        // For *vxp, whose CP-inclusive has just increased to
        // newInclusiveCp, iterate to all wayward nodes, update the edges
        // of each, and add each to m_pending if its overall CP has grown.
        for (V3GraphEdge* edgep = vxp->beginp(m_way); edgep;
             edgep = edgep->nextp(m_way)) {
            if (!m_edgeFuncp(edgep)) continue;
            V3GraphVertex* relativep = edgep->furtherp(m_way);
            m_accessp->notifyEdgeCp(relativep, m_way, vxp, newInclusiveCp);

            if (m_accessp->critPathCost(relativep, m_way) < newInclusiveCp) {
                // relativep's critPathCost() is out of step with its
                // longest !wayward edge. Schedule that to be resolved.
                uint32_t newPendingVal
                    = newInclusiveCp - m_accessp->critPathCost(relativep, m_way);
                if (m_pending.has(relativep)) {
                    if (newPendingVal > m_pending.at(relativep)) {
                        m_pending.set(relativep, newPendingVal);
                    }
                } else {
                    m_pending.set(relativep, newPendingVal);
                }
            }
        }
    }
    void go() {
        // m_pending maps each pending vertex to the amount that its wayward
        // CP will grow.
        //
        // We can iterate over the pending set in reverse order, always
        // choosing the nodes with the largest pending CP-growth.
        //
        // The intuition is: if the original seed node had its CP grow by
        // 50, the most any wayward node can possibly grow is also 50. So
        // for anything pending to grow by 50, we know we can process it
        // once and we won't have to grow its CP again on the current pass.
        // After we're done with all the grow-by-50s, nothing else will
        // grow by 50 again on the current pass, and we can process the
        // grow-by-49s and we know we'll only have to process each one
        // once. And so on.
        //
        // This generalizes to multiple seed nodes also.
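        //
        // For example (hypothetical numbers): if the seed's CP grew by 50,
        // m_pending might hold {A: +50, B: +30, C: +12}. We pop A first;
        // relatives of A may be added to (or bumped up in) m_pending, but
        // never by more than 50, so A itself never needs a second visit.
        // Once all the +50 entries are done, the same argument applies to
        // the +30 entries, and so on down.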
while (!m_pending.empty()) { PropCpPendSet::reverse_iterator it = m_pending.rbegin(); V3GraphVertex* updateMep = (*it).key(); uint32_t cpGrowBy = (*it).value(); m_pending.erase(it); // For *updateMep, whose critPathCost was out-of-date with respect // to its edges, update the critPathCost. uint32_t startCp = m_accessp->critPathCost(updateMep, m_way); uint32_t newCp = startCp + cpGrowBy; if (m_slowAsserts) { m_accessp->checkNewCpVersusEdges(updateMep, m_way, newCp); } m_accessp->setCritPathCost(updateMep, m_way, newCp); cpHasIncreased(updateMep, newCp + m_accessp->cost(updateMep)); } } private: VL_DEBUG_FUNC; VL_UNCOPYABLE(PartPropagateCp); }; class PartPropagateCpSelfTest { private: // MEMBERS V3Graph m_graph; // A graph V3GraphVertex* m_vx[50]; // All vertices within the graph typedef vl_unordered_map CpMap; CpMap m_cp; // Vertex-to-CP map CpMap m_seen; // Set of vertices we've seen // CONSTRUCTORS PartPropagateCpSelfTest() {} ~PartPropagateCpSelfTest() {} // METHODS protected: friend class PartPropagateCp; void notifyEdgeCp(V3GraphVertex* vxp, GraphWay way, V3GraphVertex* throughp, uint32_t cp) const { uint32_t throughCost = critPathCost(throughp, way); UASSERT_SELFTEST(uint32_t, cp, (1 + throughCost)); } private: void checkNewCpVersusEdges(V3GraphVertex* vxp, GraphWay way, uint32_t cp) const { // Don't need to check this in the self test; it supports an assert // that runs in production code. } void setCritPathCost(V3GraphVertex* vxp, GraphWay way, uint32_t cost) { m_cp[vxp] = cost; // Confirm that we only set each node's CP once. That's an // important property of PartPropagateCp which allows it to be far // faster than a recursive algorithm on some graphs. CpMap::iterator it = m_seen.find(vxp); if (it != m_seen.end()) vxp->v3fatalSrc("Set CP on node twice"); m_seen[vxp] = cost; } uint32_t critPathCost(V3GraphVertex* vxp, GraphWay way) const { CpMap::const_iterator it = m_cp.find(vxp); if (it != m_cp.end()) return it->second; return 0; } uint32_t cost(const V3GraphVertex*) const { return 1; } void partInitCriticalPaths(bool checkOnly) { // Set up the FORWARD cp's only. This test only looks in one // direction, it assumes REVERSE is symmetrical and would be // redundant to test. GraphStreamUnordered order(&m_graph); while (const V3GraphVertex* cvxp = order.nextp()) { V3GraphVertex* vxp = const_cast(cvxp); uint32_t cpCost = 0; for (V3GraphEdge* edgep = vxp->inBeginp(); edgep; edgep = edgep->inNextp()) { V3GraphVertex* parentp = edgep->fromp(); cpCost = std::max(cpCost, critPathCost(parentp, GraphWay::FORWARD) + 1); } if (checkOnly) { UASSERT_SELFTEST(uint32_t, cpCost, critPathCost(vxp, GraphWay::FORWARD)); } else { setCritPathCost(vxp, GraphWay::FORWARD, cpCost); } } } void go() { // Generate a pseudo-random graph uint16_t rngState[3] = { 0xdead, 0xbeef, 0xf000 }; // Create 50 vertices for (unsigned i = 0; i < 50; ++i) { m_vx[i] = new V3GraphVertex(&m_graph); } // Create 250 edges at random. Edges must go from // lower-to-higher index vertices, so we get a DAG. for (unsigned i = 0; i < 250; ++i) { unsigned idx1 = nrand48(rngState) % 50; unsigned idx2 = nrand48(rngState) % 50; if (idx1 > idx2) { new V3GraphEdge(&m_graph, m_vx[idx2], m_vx[idx1], 1); } else if (idx2 > idx1) { new V3GraphEdge(&m_graph, m_vx[idx1], m_vx[idx2], 1); } } partInitCriticalPaths(false); // This SelfTest class is also the T_CostAccessor PartPropagateCp prop(&m_graph, GraphWay::FORWARD, this, true); // Seed the propagator with every input node; // This should result in the complete graph getting all CP's assigned. 
for (unsigned i = 0; i < 50; ++i) { if (!m_vx[i]->inBeginp()) { prop.cpHasIncreased(m_vx[i], 1 /* inclusive CP starts at 1 */); } } // Run the propagator. // * The setCritPathCost() routine checks that each node's CP changes // at most once. // * The notifyEdgeCp routine is also self checking. m_seen.clear(); prop.go(); // Finally, confirm that the entire graph appears to have correct CPs. partInitCriticalPaths(true); } public: static void selfTest() { PartPropagateCpSelfTest().go(); } }; //###################################################################### // LogicMTask class LogicMTask : public AbstractLogicMTask { public: // TYPES typedef std::list VxList; struct CmpLogicMTask { bool operator() (const LogicMTask* ap, const LogicMTask* bp) const { return ap->id() < bp->id(); } }; // This adaptor class allows the PartPropagateCp class to be somewhat // independent of the LogicMTask class // - PartPropagateCp can thus be declared before LogicMTask // - PartPropagateCp could be reused with graphs of other node types // in the future, using another Accessor adaptor. class CpCostAccessor { public: CpCostAccessor() {} ~CpCostAccessor() {} // Return cost of this node uint32_t cost(const V3GraphVertex* vxp) const { const LogicMTask* mtaskp = dynamic_cast(vxp); return mtaskp->stepCost(); } // Return stored CP to this node uint32_t critPathCost(const V3GraphVertex* vxp, GraphWay way) const { const LogicMTask* mtaskp = dynamic_cast(vxp); return mtaskp->critPathCost(way); } // Store a new CP to this node void setCritPathCost(V3GraphVertex* vxp, GraphWay way, uint32_t cost) const { LogicMTask* mtaskp = dynamic_cast(vxp); mtaskp->setCritPathCost(way, cost); } // Notify vxp that the wayward CP at the throughp-->vxp edge // has increased to 'cp'. (vxp is wayward from throughp.) // This is our cue to update vxp's m_edges[!way][throughp]. void notifyEdgeCp(V3GraphVertex* vxp, GraphWay way, V3GraphVertex* throuvhVxp, uint32_t cp) const { LogicMTask* updateVxp = dynamic_cast(vxp); LogicMTask* lthrouvhVxp = dynamic_cast(throuvhVxp); EdgeSet& edges = updateVxp->m_edges[way.invert()]; uint32_t edgeCp = edges.at(lthrouvhVxp); if (cp > edgeCp) edges.set(lthrouvhVxp, cp); } // Check that CP matches that of the longest edge wayward of vxp. void checkNewCpVersusEdges(V3GraphVertex* vxp, GraphWay way, uint32_t cp) const { LogicMTask* mtaskp = dynamic_cast(vxp); EdgeSet& edges = mtaskp->m_edges[way.invert()]; // This is mtaskp's relative with longest !wayward inclusive CP: EdgeSet::reverse_iterator edgeIt = edges.rbegin(); uint32_t edgeCp = (*edgeIt).value(); if (edgeCp != cp) vxp->v3fatalSrc("CP doesn't match longest wayward edge"); } private: VL_UNCOPYABLE(CpCostAccessor); }; private: // MEMBERS // Set of MTaskMoveVertex's assigned to this mtask. LogicMTask does not // own the MTaskMoveVertex objects, we merely keep pointers to them // here. VxList m_vertices; // Cost estimate for this LogicMTask, derived from V3InstrCount. // In abstract time units. uint32_t m_cost; // Cost of critical paths going FORWARD from graph-start to the start // of this vertex, and also going REVERSE from the end of the graph to // the end of the vertex. Same units as m_cost. uint32_t m_critPathCost[GraphWay::NUM_WAYS]; uint32_t m_serialId; // Unique MTask ID number // Count "generations" which are just operations that scan through the // graph. We'll mark each node with the last generation that scanned // it. We can use this to avoid recursing through the same node twice // while searching for a path. 
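    // (For example: incGeneration() below hands out 1, 2, 3, ...; each call
    // to pathExistsFrom() grabs a fresh generation, stamps every vertex it
    // visits with it, and returns early from any vertex that already
    // carries the current stamp.)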
vluint64_t m_generation; // Redundant with the V3GraphEdge's, store a map of relatives so we can // quickly check if we have a given parent or child. // // 'm_edges[way]' maps a wayward relative to the !way critical path at // our edge with them. The SortByValueMap supports iterating over // relatives in longest-to-shortest CP order. We rely on this ordering // in more than one place. typedef SortByValueMap EdgeSet; EdgeSet m_edges[GraphWay::NUM_WAYS]; public: // CONSTRUCTORS LogicMTask(V3Graph* graphp, MTaskMoveVertex* mtmvVxp) : AbstractLogicMTask(graphp) , m_cost(0) , m_generation(0) { for (int i=0; ilogicp()) { m_cost += V3InstrCount::count(olvp->nodep(), true); } } // Start at 1, so that 0 indicates no mtask ID. static uint32_t s_nextId = 1; m_serialId = s_nextId++; UASSERT(s_nextId < 0xFFFFFFFFUL, "Too many mtasks"); } // METHODS void moveAllVerticesFrom(LogicMTask* otherp) { // splice() is constant time m_vertices.splice(m_vertices.end(), otherp->m_vertices); m_cost += otherp->m_cost; } virtual const VxList* vertexListp() const { return &m_vertices; } static vluint64_t incGeneration() { static vluint64_t s_generation = 0; ++s_generation; return s_generation; } // Use this instead of pointer-compares to compare LogicMTasks. Avoids // nondeterministic output. Also name mtasks based on this number in // the final C++ output. virtual uint32_t id() const { return m_serialId; } void id(uint32_t id) { m_serialId = id; } // Abstract cost of every logic mtask virtual uint32_t cost() const { return m_cost; } void setCost(uint32_t cost) { m_cost = cost; } // For tests only uint32_t stepCost() const { return stepCost(m_cost); } static uint32_t stepCost(uint32_t cost) { #if PART_STEPPED_COST // Round cost up to the nearest 5%. Use this when computing all // critical paths. The idea is that critical path changes don't // need to propagate when they don't exceed the next step, saving a // lot of recursion. if (cost == 0) return 0; double logcost = log(cost); // log(1.05) is about 0.05 // So, round logcost up to the next 0.05 boundary logcost *= 20.0; logcost = ceil(logcost); logcost = logcost / 20.0; uint32_t stepCost = (uint32_t)(exp(logcost)); UASSERT_STATIC(stepCost >= cost, "stepped cost error exceeded"); UASSERT_STATIC(stepCost <= ((cost * 11 / 10)), "stepped cost error exceeded"); return stepCost; #else return cost; #endif } void addRelative(GraphWay way, LogicMTask* relativep) { EdgeSet& edges = m_edges[way]; UASSERT(!edges.has(relativep), "Adding existing edge"); // value is !way cp to this edge edges.set(relativep, relativep->stepCost() + relativep->critPathCost(way.invert())); } void removeRelative(GraphWay way, LogicMTask* relativep) { EdgeSet& edges = m_edges[way]; edges.erase(relativep); } bool hasRelative(GraphWay way, LogicMTask* relativep) { EdgeSet& edges = m_edges[way]; return edges.has(relativep); } void checkRelativesCp(GraphWay way) const { const EdgeSet& edges = m_edges[way]; for (EdgeSet::const_reverse_iterator it = edges.rbegin(); it != edges.rend(); ++it) { LogicMTask* relativep = (*it).key(); uint32_t cachedCp = (*it).value(); partCheckCachedScoreVsActual (cachedCp, relativep->critPathCost(way.invert()) + relativep->stepCost()); } } virtual string name() const { // Display forward and reverse critical path costs. This gives a quick // read on whether graph partitioning looks reasonable or bad. 
std::ostringstream out; out <<"mt"<furtherp(way), "In critPathCostWithout(), edge 'withoutp' must " "further to 'this'"); // Iterate through edges until we get a relative other than // wayEdgeEndp(way, withoutp). This should take 2 iterations max. const EdgeSet& edges = m_edges[way.invert()]; uint32_t result = 0; for (EdgeSet::const_reverse_iterator it = edges.rbegin(); it != edges.rend(); ++it) { if ((*it).key() != withoutp->furtherp(way.invert())) { // Use the cached cost. It could be a small overestimate // due to stepping. This is consistent with critPathCost() // which also returns the cached cost. result = (*it).value(); break; } } return result; } private: static bool pathExistsFromInternal(LogicMTask* fromp, LogicMTask* top, const V3GraphEdge* excludedEdgep, vluint64_t generation) { // Q) Why does this take LogicMTask instead of generic V3GraphVertex? // A) We'll use the critical paths known to LogicMTask to prune the // recursion for speed. Also store 'generation' in // LogicMTask::m_generation so we can prune the search and avoid // recursing through the same node more than once in a single // search. if (fromp->m_generation == generation) { // Already looked at this node in the current search. // Since we're back again, we must not have found a path on the // first go. return false; } fromp->m_generation = generation; // Base case: we found a path. if (fromp == top) return true; // Base case: fromp is too late, cannot possibly be a prereq for top. if (fromp->critPathCost(GraphWay::REVERSE) < (top->critPathCost(GraphWay::REVERSE) + top->stepCost())) return false; if ((fromp->critPathCost(GraphWay::FORWARD) + fromp->stepCost()) > top->critPathCost(GraphWay::FORWARD)) return false; // Recursively look for a path for (const V3GraphEdge* followp = fromp->outBeginp(); followp; followp = followp->outNextp()) { if (followp == excludedEdgep) continue; LogicMTask* nextp = dynamic_cast(followp->top()); if (pathExistsFromInternal(nextp, top, NULL, generation)) return true; } return false; } // True if there's a path from 'fromp' to 'top' excluding // 'excludedEdgep', false otherwise. // // 'excludedEdgep' may be NULL in which case no edge is excluded. If // 'excludedEdgep' is non-NULL it must connect fromp and top. 
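    //
    // Note the CP-based base cases in pathExistsFromInternal() above are
    // what keep this search cheap: a path fromp -> ... -> top can only
    // exist if
    //     fromp->critPathCost(FORWARD) + fromp->stepCost()
    //         <= top->critPathCost(FORWARD)
    // (and symmetrically in the REVERSE direction), so most candidate
    // pairs are rejected without recursing at all.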
// // TODO: consider changing this API to the 'isTransitiveEdge' API // used by GraphPathChecker public: static bool pathExistsFrom(LogicMTask* fromp, LogicMTask* top, const V3GraphEdge* excludedEdgep) { return pathExistsFromInternal(fromp, top, excludedEdgep, incGeneration()); } static void dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment) { string filename = v3Global.debugFilename(nameComment)+".txt"; UINFO(1,"Writing "< ofp(V3File::new_ofstream(filename)); std::ostream* osp = &(*ofp); // &* needed to deref unique_ptr if (osp->fail()) v3fatalStatic("Can't write "<verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { const LogicMTask* mtaskp = dynamic_cast(vxp); if (!startp) { startp = mtaskp; continue; } if (mtaskp->cost() + mtaskp->critPathCost(GraphWay::REVERSE) > startp->cost() + startp->critPathCost(GraphWay::REVERSE)) { startp = mtaskp; } } // Follow the entire critical path std::vector path; uint32_t totalCost = 0; for (const LogicMTask* nextp = startp; nextp;) { path.push_back(nextp); totalCost += nextp->cost(); const EdgeSet& children = nextp->m_edges[GraphWay::FORWARD]; EdgeSet::const_reverse_iterator it = children.rbegin(); if (it == children.rend()) nextp = NULL; else nextp = (*it).key(); } *osp<<"totalCost = "<::iterator it = path.begin(); it != path.end(); ++it) { const LogicMTask* mtaskp = *it; *osp<<"begin mtask with cost "<cost()<vertexListp()->begin(); lit != mtaskp->vertexListp()->end(); ++lit) { const OrderLogicVertex* logicp = (*lit)->logicp(); if (!logicp) continue; if (0) { // Show nodes only *osp<<"> "; logicp->nodep()->dumpTree(*osp); } else { // Show nodes with hierarchical costs V3InstrCount::count(logicp->nodep(), false, osp); } } } } private: VL_DEBUG_FUNC; // Declare debug() VL_UNCOPYABLE(LogicMTask); }; //###################################################################### // MTask utility classes // Sort AbstractMTask objects into deterministic order by calling id() // which is a unique and stable serial number. 
class MTaskIdLessThan { public: MTaskIdLessThan() {} virtual ~MTaskIdLessThan() {} virtual bool operator() (const AbstractMTask* lhsp, const AbstractMTask* rhsp) const { return lhsp->id() < rhsp->id(); } }; // Information associated with scoreboarding an MTask class MergeCandidate { private: bool m_removedFromSb; // Not on scoreboard, generally ignore vluint64_t m_id; // Serial number for ordering public: // CONSTRUCTORS MergeCandidate() : m_removedFromSb(false) { static vluint64_t serial = 0; ++serial; m_id = serial; } virtual bool mergeWouldCreateCycle() const = 0; // METHODS bool removedFromSb() const { return m_removedFromSb; } void removedFromSb(bool removed) { m_removedFromSb = removed; } bool operator<(const MergeCandidate& other) const { return m_id < other.m_id; } }; // A pair of associated LogicMTask's that are merge candidates for sibling // contraction class SiblingMC : public MergeCandidate { private: LogicMTask* m_ap; LogicMTask* m_bp; // CONSTRUCTORS SiblingMC() VL_EQ_DELETE; public: SiblingMC(LogicMTask* ap, LogicMTask* bp) { // Assign 'ap' and 'bp' in a canonical order, so we can more easily // compare pairs of SiblingMCs if (ap->id() > bp->id()) { m_ap = ap; m_bp = bp; } else { m_ap = bp; m_bp = ap; } } virtual ~SiblingMC() {} // METHODS LogicMTask* ap() const { return m_ap; } LogicMTask* bp() const { return m_bp; } bool mergeWouldCreateCycle() const { return (LogicMTask::pathExistsFrom(m_ap, m_bp, NULL) || LogicMTask::pathExistsFrom(m_bp, m_ap, NULL)); } bool operator<(const SiblingMC& other) const { if (m_ap->id() < other.m_ap->id()) { return true; } if (m_ap->id() > other.m_ap->id()) { return false; } return m_bp->id() < other.m_bp->id(); } }; // GraphEdge for the MTask graph class MTaskEdge : public V3GraphEdge, public MergeCandidate { public: // CONSTRUCTORS MTaskEdge(V3Graph* graphp, LogicMTask* fromp, LogicMTask* top, int weight) : V3GraphEdge(graphp, fromp, top, weight), MergeCandidate() { fromp->addRelative(GraphWay::FORWARD, top); top->addRelative(GraphWay::REVERSE, fromp); } virtual ~MTaskEdge() { fromMTaskp()->removeRelative(GraphWay::FORWARD, toMTaskp()); toMTaskp()->removeRelative(GraphWay::REVERSE, fromMTaskp()); } // METHODS LogicMTask* furtherMTaskp(GraphWay way) const { return dynamic_cast(this->furtherp(way)); } LogicMTask* fromMTaskp() const { return dynamic_cast(fromp()); } LogicMTask* toMTaskp() const { return dynamic_cast(top()); } virtual bool mergeWouldCreateCycle() const { return LogicMTask::pathExistsFrom(fromMTaskp(), toMTaskp(), this); } static MTaskEdge* cast(V3GraphEdge* edgep) { if (!edgep) return NULL; MTaskEdge* resultp = dynamic_cast(edgep); UASSERT(resultp, "Failed to cast in MTaskEdge::cast"); return resultp; } // Following initial assignment of critical paths, clear this MTaskEdge // out of the edge-map for each node and reinsert at a new location // with updated critical path. 
void resetCriticalPaths() { LogicMTask* fromp = fromMTaskp(); LogicMTask* top = toMTaskp(); fromp->removeRelative(GraphWay::FORWARD, top); top->removeRelative(GraphWay::REVERSE, fromp); fromp->addRelative(GraphWay::FORWARD, top); top->addRelative(GraphWay::REVERSE, fromp); } private: VL_UNCOPYABLE(MTaskEdge); }; //###################################################################### // Vertex utility classes class OrderByPtrId { PartPtrIdMap m_ids; public: virtual bool operator() (const OrderVarStdVertex* lhsp, const OrderVarStdVertex* rhsp) const { vluint64_t l_id = m_ids.findId(lhsp); vluint64_t r_id = m_ids.findId(rhsp); return l_id < r_id; } }; //###################################################################### // PartParallelismEst - Estimate parallelism of graph class PartParallelismEst { // MEMBERS const V3Graph* m_graphp; // Mtask-containing graph // Total cost of evaluating the whole graph. // The ratio of m_totalGraphCost to longestCpCost gives us an estimate // of the parallelizability of this graph which is only as good as the // guess returned by LogicMTask::cost(). uint32_t m_totalGraphCost; // Cost of the longest critical path, in abstract units (the same units // returned by the vertexCost) uint32_t m_longestCpCost; size_t m_vertexCount; // Number of vertexes calculated size_t m_edgeCount; // Number of edges calculated public: // CONSTRUCTORS explicit PartParallelismEst(const V3Graph* graphp) : m_graphp(graphp), m_totalGraphCost(0), m_longestCpCost(0), m_vertexCount(0), m_edgeCount(0) {} // METHODS uint32_t totalGraphCost() const { return m_totalGraphCost; } uint32_t longestCritPathCost() const { return m_longestCpCost; } size_t vertexCount() const { return m_vertexCount; } size_t edgeCount() const { return m_edgeCount; } double parallelismFactor() const { return (static_cast(m_totalGraphCost) / m_longestCpCost); } void traverse() { // For each node, record the critical path cost from the start // of the graph through the end of the node. vl_unordered_map critPaths; GraphStreamUnordered serialize(m_graphp); for (const V3GraphVertex* vertexp; (vertexp = serialize.nextp());) { m_vertexCount++; uint32_t cpCostToHere = 0; for (V3GraphEdge* edgep = vertexp->inBeginp(); edgep; edgep = edgep->inNextp()) { ++m_edgeCount; // For each upstream item, add its critical path cost to // the cost of this edge, to form a new candidate critical // path cost to the current node. Whichever is largest is // the critical path to reach the start of this node. cpCostToHere = std::max(cpCostToHere, critPaths[edgep->fromp()]); } // Include the cost of the current vertex in the critical // path, so it represents the critical path to the end of // this vertex. cpCostToHere += vertexCost(vertexp); critPaths[vertexp] = cpCostToHere; m_longestCpCost = std::max(m_longestCpCost, cpCostToHere); // Tally the total cost contributed by vertices. 
m_totalGraphCost += vertexCost(vertexp); } } void statsReport(const string& stage) { V3Stats::addStat("MTask graph, "+stage+", critical path cost", m_longestCpCost); V3Stats::addStat("MTask graph, "+stage+", total graph cost", m_totalGraphCost); V3Stats::addStat("MTask graph, "+stage+", mtask count", m_vertexCount); V3Stats::addStat("MTask graph, "+stage+", edge count", m_edgeCount); V3Stats::addStat("MTask graph, "+stage+", parallelism factor", parallelismFactor()); } void debugReport() { UINFO(0, " Critical path cost = "<furtherp(rev)) != relatives.end()) { mtaskp->v3fatalSrc("Should be no redundant edges in mtasks graph"); } relatives.insert(edgep->furtherp(rev)); LogicMTask* relativep = dynamic_cast(edgep->furtherp(rev)); cpCost = std::max(cpCost, (relativep->critPathCost(way) + static_cast(relativep->stepCost()))); } if (checkOnly) { partCheckCachedScoreVsActual(mtaskp->critPathCost(way), cpCost); } else { mtaskp->setCritPathCost(way, cpCost); } } } // Look at vertex costs to form critical paths for each vertex. static void partInitCriticalPaths(V3Graph* mtasksp) { partInitHalfCriticalPaths(GraphWay::FORWARD, mtasksp, false); partInitHalfCriticalPaths(GraphWay::REVERSE, mtasksp, false); // Reset all MTaskEdges so that 'm_edges' will show correct CP numbers. // They would have been all zeroes on initial creation of the MTaskEdges. std::vector edges; for (V3GraphVertex* vxp = mtasksp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { for (V3GraphEdge* edgep = vxp->outBeginp(); edgep; edgep = edgep->outNextp()) { MTaskEdge* mtedgep = dynamic_cast(edgep); mtedgep->resetCriticalPaths(); } } } // Do an EXPENSIVE check to make sure that all incremental CP updates have // gone correctly. static void partCheckCriticalPaths(V3Graph* mtasksp) { partInitHalfCriticalPaths(GraphWay::FORWARD, mtasksp, true); partInitHalfCriticalPaths(GraphWay::REVERSE, mtasksp, true); for (V3GraphVertex* vxp = mtasksp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { LogicMTask* mtaskp = dynamic_cast(vxp); mtaskp->checkRelativesCp(GraphWay::FORWARD); mtaskp->checkRelativesCp(GraphWay::REVERSE); } } // Advance to nextp(way) and delete edge static V3GraphEdge* partBlastEdgep(GraphWay way, V3GraphEdge* edgep) { V3GraphEdge* nextp = edgep->nextp(way); edgep->unlinkDelete(); VL_DANGLING(edgep); return nextp; } // Merge edges from a LogicMtask. // // This code removes 'hasRelative' edges. When this occurs, mark it in need // of a rescore, in case its score has fallen and we need to move it up // toward the front of the scoreboard. // // Wait, whaaat? Shouldn't the scores only increase as we merge nodes? Well // that's almost true. But there is one exception. // // Suppose we have A->B, B->C, and A->C. // // The A->C edge is a "transitive" edge. It's ineligible to be merged, as // the merge would create a cycle. We score it on the scoreboard like any // other edge. // // However, our "score" estimate for A->C is bogus, because the forward // critical path to C and the reverse critical path to A both contain the // same node (B) so we overestimate the score of A->C. At first this // doesn't matter, since transitive edges aren't eligible to merge anyway. // // Later, suppose the edge contractor decides to merge the B->C edge, with // B donating all its incoming edges into C, say. (So we reach this // function.) // // With B going away, the A->C edge will no longer be transitive and it // will become eligible to merge. 
But if we don't mark it for rescore, // it'll stay in the scoreboard with its old (overestimate) score. We'll // merge it too late due to the bogus score. When we finally merge it, we // fail the assert in the main edge contraction loop which checks that the // actual score did not fall below the scoreboard's score. // // Another way of stating this: this code ensures that scores of // non-transitive edges only ever increase. static void partMergeEdgesFrom(V3Graph* mtasksp, LogicMTask* recipientp, LogicMTask* donorp, V3Scoreboard* sbp) { for (unsigned wi = 0; wi < 2; ++wi) { GraphWay way = wi ? GraphWay::REVERSE : GraphWay::FORWARD; for (V3GraphEdge* edgep = donorp->beginp(way); edgep; edgep = partBlastEdgep(way, edgep)) { MTaskEdge* tedgep = MTaskEdge::cast(edgep); if (sbp && !tedgep->removedFromSb()) sbp->removeElem(tedgep); // Existing edge; mark it in need of a rescore if (recipientp->hasRelative(way, tedgep->furtherMTaskp(way))) { if (sbp) { MTaskEdge* existMTaskEdgep = MTaskEdge::cast(recipientp->findConnectingEdgep (way, tedgep->furtherMTaskp(way))); UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge"); if (!existMTaskEdgep->removedFromSb()) { sbp->hintScoreChanged(existMTaskEdgep); } } } else { // No existing edge into *this, make one. MTaskEdge* newEdgep; if (way == GraphWay::REVERSE) { newEdgep = new MTaskEdge(mtasksp, tedgep->fromMTaskp(), recipientp, 1); } else { newEdgep = new MTaskEdge(mtasksp, recipientp, tedgep->toMTaskp(), 1); } if (sbp) sbp->addElem(newEdgep); } } } } //###################################################################### // PartContraction // Perform edge or sibling contraction on the partition graph class PartContraction { private: // TYPES // TODO: might get a little more speed by making this a // vl_unordered_set and defining hash and equal_to functors for the // SiblingMC: typedef std::set SibSet; typedef vl_unordered_set SibpSet; typedef vl_unordered_map MTask2Sibs; // New CP information for mtaskp reflecting an upcoming merge struct NewCp { uint32_t cp; uint32_t propagateCp; bool propagate; }; // MEMBERS V3Graph* m_mtasksp; // Mtask graph uint32_t m_scoreLimit; // Sloppy score allowed when picking merges uint32_t m_scoreLimitBeforeRescore; // Next score rescore at unsigned m_mergesSinceRescore; // Merges since last rescore bool m_slowAsserts; // Take extra time to validate algorithm V3Scoreboard m_sb; // Scoreboard SibSet m_pairs; // Storage for each SiblingMC MTask2Sibs m_mtask2sibs; // SiblingMC set for each mtask public: // CONSTRUCTORS PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, bool slowAsserts) : m_mtasksp(mtasksp) , m_scoreLimit(scoreLimit) , m_scoreLimitBeforeRescore(0xffffffff) , m_mergesSinceRescore(0) , m_slowAsserts(slowAsserts) , m_sb(&mergeCandidateScore, slowAsserts) { } // METHODS void go() { unsigned maxMTasks = v3Global.opt.threadsMaxMTasks(); if (maxMTasks == 0) { // Unspecified so estimate if (v3Global.opt.threads() > 1) { maxMTasks = (PART_DEFAULT_MAX_MTASKS_PER_THREAD * v3Global.opt.threads()); } else { // Running PartContraction with --threads <= 1 means self-test maxMTasks = 500; } } // OPTIMIZATION PASS: Edge contraction and sibling contraction. // - Score each pair of mtasks which is a candidate to merge. // * Each edge defines such a candidate pair // * Two mtasks that are prereqs or postreqs of a common third // vertex are "siblings", these are also a candidate pair. // - Build a list of MergeCandidates, sorted by score. // - Merge the best pair. 
// - Incrementally recompute critical paths near the merged mtask. for (V3GraphVertex* itp = m_mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) { vl_unordered_set neighbors; for (V3GraphEdge* edgep = itp->outBeginp(); edgep; edgep=edgep->outNextp()) { m_sb.addElem(MTaskEdge::cast(edgep)); if (neighbors.find(edgep->top()) != neighbors.end()) { itp->v3fatalSrc("Redundant edge found in input to PartContraction()"); } neighbors.insert(edgep->top()); } siblingPairFromRelatives(GraphWay::REVERSE, itp, true); siblingPairFromRelatives(GraphWay::FORWARD, itp, true); } doRescore(); // Set initial scores in scoreboard while (1) { // This is the best edge to merge, with the lowest // score (shortest local critical path) MergeCandidate* mergeCanp = const_cast(m_sb.bestp()); if (!mergeCanp) { // Scoreboard found no eligible merges. Maybe a rescore // will produce some merge-able pairs? if (m_sb.needsRescore()) { doRescore(); continue; } break; } if (m_slowAsserts) { UASSERT(!m_sb.needsRescore(mergeCanp), "Need-rescore items should not be returned by bestp"); } uint32_t cachedScore = m_sb.cachedScore(mergeCanp); uint32_t actualScore = mergeCandidateScore(mergeCanp); if (actualScore > cachedScore) { // Cached score is out-of-date. // Mark this elem as in need of a rescore and continue. m_sb.hintScoreChanged(mergeCanp); continue; } // ... we'll also confirm that actualScore hasn't shrunk relative // to cached score, after the mergeWouldCreateCycle() check. if (actualScore > m_scoreLimit) { // Our best option isn't good enough if (m_sb.needsRescore()) { // Some pairs need a rescore, maybe those will be // eligible to merge afterward. doRescore(); continue; } else { // We've exhausted everything below m_scoreLimit; stop. // Except, if we have too many mtasks, raise the score // limit and keep going... unsigned mtaskCount = 0; for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { ++mtaskCount; } if (mtaskCount > maxMTasks) { uint32_t oldLimit = m_scoreLimit; m_scoreLimit = (m_scoreLimit * 120) / 100; // Line must be >0 otherwise FileLine doesn't check // if the warning is suppressed with -Wno-UNOPTTHREADS FileLine dummyFl("AstRoot", 1); dummyFl.v3warn(UNOPTTHREADS, "Thread scheduler is unable to provide requested parallelism; consider asking for fewer threads."); UINFO(1,"Critical path limit was="<furtherp(way)) { newCp = std::max(otherp->critPathCost(way), mtaskp->critPathCostWithout(way, mergeEdgep)); } else { newCp = std::max(mtaskp->critPathCost(way), otherp->critPathCostWithout(way, mergeEdgep)); } } else { newCp = std::max(otherp->critPathCost(way), mtaskp->critPathCost(way)); } uint32_t origRelativesCp = mtaskp->critPathCost(way) + mtaskp->stepCost(); uint32_t newRelativesCp = newCp + LogicMTask::stepCost(mtaskp->cost() + otherp->cost()); NewCp result; result.cp = newCp; result.propagate = (newRelativesCp > origRelativesCp); result.propagateCp = newRelativesCp; return result; } void removeSiblingMCsWith(LogicMTask* mtaskp) { for (SibpSet::iterator it = m_mtask2sibs[mtaskp].begin(); it != m_mtask2sibs[mtaskp].end(); ++it) { const SiblingMC* pairp = *it; if (!pairp->removedFromSb()) { m_sb.removeElem(pairp); } LogicMTask* otherp = (pairp->bp() == mtaskp) ? 
pairp->ap() : pairp->bp(); size_t erased = m_mtask2sibs[otherp].erase(pairp); if (erased <= 0) otherp->v3fatalSrc("Expected existing mtask"); erased = m_pairs.erase(*pairp); if (erased <= 0) mtaskp->v3fatalSrc("Expected existing mtask"); } size_t erased = m_mtask2sibs.erase(mtaskp); if (erased <= 0) mtaskp->v3fatalSrc("Expected existing mtask"); } void contract(MergeCandidate* mergeCanp) { LogicMTask *top = NULL; LogicMTask *fromp = NULL; MTaskEdge* mergeEdgep = dynamic_cast(mergeCanp); SiblingMC* mergeSibsp = NULL; if (mergeEdgep) { top = dynamic_cast(mergeEdgep->top()); fromp = dynamic_cast(mergeEdgep->fromp()); } else { mergeSibsp = dynamic_cast(mergeCanp); UASSERT(mergeSibsp, "Failed to cast mergeCanp to either MTaskEdge or SiblingMC"); top = mergeSibsp->ap(); fromp = mergeSibsp->bp(); } // Merge the smaller mtask into the larger mtask. If one of them // is much larger, this will save time in partMergeEdgesFrom(). // Assume the more costly mtask has more edges. // // [TODO: now that we have edge maps, we could count the edges // exactly without a linear search.] LogicMTask* recipientp; LogicMTask* donorp; if (fromp->cost() > top->cost()) { recipientp = fromp; donorp = top; } else { donorp = fromp; recipientp = top; } fromp = top = NULL; // Use donorp and recipientp now instead // Recursively update forward and reverse CP numbers. // // Doing this before merging the mtasks lets us often avoid // recursing through either incoming or outgoing edges on one or // both mtasks. // // These 'NewCp' objects carry a bit indicating whether we must // propagate CP for each of the four cases: NewCp recipientNewCpFwd = newCp(GraphWay::FORWARD, recipientp, donorp, mergeEdgep); NewCp donorNewCpFwd = newCp(GraphWay::FORWARD, donorp, recipientp, mergeEdgep); NewCp recipientNewCpRev = newCp(GraphWay::REVERSE, recipientp, donorp, mergeEdgep); NewCp donorNewCpRev = newCp(GraphWay::REVERSE, donorp, recipientp, mergeEdgep); if (mergeEdgep) { // Remove and free the connecting edge. Must do this before // propagating CP's below. m_sb.removeElem(mergeCanp); mergeEdgep->unlinkDelete(); mergeEdgep=NULL; } // This also updates cost and stepCost on recipientp recipientp->moveAllVerticesFrom(donorp); UINFO(9, "recipient = "<id() << ", donor = "<id() << ", mergeEdgep = "< forwardPropagator(m_mtasksp, GraphWay::FORWARD, &cpAccess, m_slowAsserts); PartPropagateCp reversePropagator(m_mtasksp, GraphWay::REVERSE, &cpAccess, m_slowAsserts); recipientp->setCritPathCost(GraphWay::FORWARD, recipientNewCpFwd.cp); if (recipientNewCpFwd.propagate) { forwardPropagator.cpHasIncreased(recipientp, recipientNewCpFwd.propagateCp); } recipientp->setCritPathCost(GraphWay::REVERSE, recipientNewCpRev.cp); if (recipientNewCpRev.propagate) { reversePropagator.cpHasIncreased(recipientp, recipientNewCpRev.propagateCp); } if (donorNewCpFwd.propagate) { forwardPropagator.cpHasIncreased(donorp, donorNewCpFwd.propagateCp); } if (donorNewCpRev.propagate) { reversePropagator.cpHasIncreased(donorp, donorNewCpRev.propagateCp); } forwardPropagator.go(); reversePropagator.go(); // Remove all SiblingMCs that include donorp. This Includes the one // we're merging, if we're merging a SiblingMC. removeSiblingMCsWith(donorp); // Remove all SiblingMCs that include recipientp also, so we can't // get huge numbers of SiblingMCs. We'll recreate them below, up // to a bounded number. 
removeSiblingMCsWith(recipientp); // Merge all edges partMergeEdgesFrom(m_mtasksp, recipientp, donorp, &m_sb); // Delete the donorp mtask from the graph donorp->unlinkDelete(m_mtasksp); donorp = NULL; m_mergesSinceRescore++; // Do an expensive check, confirm we haven't botched the CP // updates. if (m_slowAsserts) partCheckCriticalPaths(m_mtasksp); // Finally, make new sibling pairs as needed: // - prereqs and postreqs of recipientp // - prereqs of recipientp's postreqs // - postreqs of recipientp's prereqs // Note that this depends on the updated critical paths (above). siblingPairFromRelatives(GraphWay::REVERSE, recipientp, true); siblingPairFromRelatives(GraphWay::FORWARD, recipientp, true); unsigned edges = 0; for (V3GraphEdge* edgep = recipientp->outBeginp(); edgep; edgep = edgep->outNextp()) { LogicMTask* postreqp = dynamic_cast(edgep->top()); siblingPairFromRelatives(GraphWay::REVERSE, postreqp, false); edges++; if (edges > PART_SIBLING_EDGE_LIMIT) break; } edges = 0; for (V3GraphEdge* edgep = recipientp->inBeginp(); edgep; edgep = edgep->inNextp()) { LogicMTask* prereqp = dynamic_cast(edgep->fromp()); siblingPairFromRelatives(GraphWay::FORWARD, prereqp, false); edges++; if (edges > PART_SIBLING_EDGE_LIMIT) break; } } void doRescore() { // During rescore, we know that graph isn't changing, so allow // the critPathCost*Without() routines to cache some data in // each LogicMTask. This is just an optimization, things should // behave identically without the caching (just slower) m_sb.rescore(); UINFO(6, "Did rescore. Merges since previous = " << m_mergesSinceRescore << endl); m_mergesSinceRescore = 0; m_scoreLimitBeforeRescore = 0xffffffff; } static uint32_t mergeCandidateScore(const MergeCandidate* pairp) { const MTaskEdge* edgep = dynamic_cast(pairp); if (edgep) { // The '1 +' favors merging a SiblingMC over an otherwise- // equal-scoring MTaskEdge. The comment on selfTest() talks // about why. return 1 + edgeScore(edgep); } const SiblingMC* sibsp = dynamic_cast(pairp); if (sibsp) { return siblingScore(sibsp); } v3fatalSrc("Failed to cast pairp to either MTaskEdge or SiblingMC in mergeCandidateScore"); return 0; } static uint32_t siblingScore(const SiblingMC* sibsp) { LogicMTask* ap = sibsp->ap(); LogicMTask* bp = sibsp->bp(); uint32_t mergedCpCostFwd = std::max(ap->critPathCost(GraphWay::FORWARD), bp->critPathCost(GraphWay::FORWARD)); uint32_t mergedCpCostRev = std::max(ap->critPathCost(GraphWay::REVERSE), bp->critPathCost(GraphWay::REVERSE)); return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(ap->cost() + bp->cost()); } static uint32_t edgeScore(const V3GraphEdge* edgep) { // Score this edge. Lower is better. The score is the new local CP // length if we merge these mtasks. ("Local" means the longest // critical path running through the merged node.) 
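        //
        // For example (hypothetical numbers): say fromp has forward CP 40
        // and reverse CP-without-this-edge 15, while top has forward
        // CP-without-this-edge 25 and reverse CP 10, with costs 20 and 30.
        // Then the score is
        //     max(40, 25) + max(15, 10) + stepCost(20 + 30)
        //       = 40 + 15 + stepCost(50)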
LogicMTask* top = dynamic_cast(edgep->top()); LogicMTask* fromp = dynamic_cast(edgep->fromp()); uint32_t mergedCpCostFwd = std::max (fromp->critPathCost(GraphWay::FORWARD), top->critPathCostWithout(GraphWay::FORWARD, edgep)); uint32_t mergedCpCostRev = std::max (fromp->critPathCostWithout(GraphWay::REVERSE, edgep), top->critPathCost(GraphWay::REVERSE)); return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(fromp->cost() + top->cost()); } void makeSiblingMC(LogicMTask* ap, LogicMTask *bp) { SiblingMC newSibs(ap, bp); std::pair insertResult = m_pairs.insert(newSibs); if (insertResult.second) { const SiblingMC* newSibsp = &(*insertResult.first); m_mtask2sibs[ap].insert(newSibsp); m_mtask2sibs[bp].insert(newSibsp); m_sb.addElem(newSibsp); } else if (m_slowAsserts) { // It's fine if we already have this SiblingMC, we may have // created it earlier. Just confirm that we have associated data. if (m_mtask2sibs.find(ap) == m_mtask2sibs.end()) { ap->v3fatalSrc("Sibling not found"); } if (m_mtask2sibs.find(bp) == m_mtask2sibs.end()) { bp->v3fatalSrc("Sibling not found"); } bool found = false; for (SibpSet::iterator it = m_mtask2sibs[ap].begin(); it != m_mtask2sibs[ap].end(); ++it) { const SiblingMC* sibsp = *it; if (!sibsp->removedFromSb() && !m_sb.contains(sibsp)) { ap->v3fatalSrc("One sibling must be the one we collided with"); } if ( (sibsp->ap() == ap && sibsp->bp() == bp) || (sibsp->bp() == ap && sibsp->ap() == bp)) found = true; } if (!found) ap->v3fatalSrc("Sibling not found"); } }; static const GraphWay* s_shortestWaywardCpInclusiveWay; static int shortestWaywardCpInclusive(const void* vap, const void* vbp) { const GraphWay* wp = s_shortestWaywardCpInclusiveWay; const LogicMTask* ap = *reinterpret_cast(vap); const LogicMTask* bp = *reinterpret_cast(vbp); uint32_t aCp = ap->critPathCost(*wp) + ap->stepCost(); uint32_t bCp = bp->critPathCost(*wp) + bp->stepCost(); if (aCp < bCp) { return -1; } if (aCp > bCp) { return 1; } if (ap->id() < bp->id()) { return -1; } if (ap->id() > bp->id()) { return 1; } return 0; } void siblingPairFromRelatives(GraphWay way, V3GraphVertex* mtaskp, bool exhaustive) { std::vector shortestPrereqs; for (V3GraphEdge* edgep = mtaskp->beginp(way); edgep; edgep = edgep->nextp(way)) { LogicMTask* prereqp = dynamic_cast(edgep->furtherp(way)); shortestPrereqs.push_back(prereqp); // Prevent nodes with huge numbers of edges from massively // slowing down the partitioner: if (shortestPrereqs.size() > PART_SIBLING_EDGE_LIMIT) break; } if (shortestPrereqs.empty()) return; // qsort_r would be nice here, but it isn't portable s_shortestWaywardCpInclusiveWay = &way; qsort(&shortestPrereqs[0], shortestPrereqs.size(), sizeof(LogicMTask*), &shortestWaywardCpInclusive); // Don't make all NxN/2 possible pairs of prereqs, that's a lot // to cart around. Just make a few pairs. std::vector::iterator it = shortestPrereqs.begin(); for (unsigned i = 0; exhaustive || (i < 3); ++i) { if (it == shortestPrereqs.end()) break; LogicMTask* ap = *(it++); if (it == shortestPrereqs.end()) break; LogicMTask* bp = *(it++); makeSiblingMC(ap, bp); } } // SELF TESTS // This is a performance test, its intent is to demonstrate that the // partitioner doesn't run on this chain in N^2 time or worse. Overall // runtime should be N*log(N) for a chain-shaped graph. // static void selfTestChain() { vluint64_t usecsSmall = partitionChainUsecs(5); vluint64_t usecsLarge = partitionChainUsecs(500); // Large input is 50x bigger than small input. 
// Its runtime should be about 10x longer -- not about 2500x longer // or worse which would suggest N^2 scaling or worse. UASSERT(usecsLarge < (usecsSmall * 1500), "selfTestChain() took longer than expected. Small input runtime = " <setCost(1); unsigned i; for (i=0; i<50; ++i) { LogicMTask* mtp = new LogicMTask(&mtasks, NULL); mtp->setCost(1); // Edge from every input -> center new MTaskEdge(&mtasks, mtp, center, 1); } for (i=0; i<50; ++i) { LogicMTask* mtp = new LogicMTask(&mtasks, NULL); mtp->setCost(1); // Edge from center -> every output new MTaskEdge(&mtasks, center, mtp, 1); } partInitCriticalPaths(&mtasks); PartContraction(&mtasks, 20, true).go(); PartParallelismEst check(&mtasks); check.traverse(); // Checking exact values here is maybe overly precise. What we're // mostly looking for is a healthy reduction in the number of // mtasks. if (debug()>=5) { UINFO(0, "X self test stats:\n"); check.debugReport(); } UASSERT_SELFTEST(uint32_t, check.longestCritPathCost(), 19); UASSERT_SELFTEST(uint32_t, check.totalGraphCost(), 101); UASSERT_SELFTEST(uint32_t, check.vertexCount(), 14); UASSERT_SELFTEST(uint32_t, check.edgeCount(), 13); } public: static void selfTest() { selfTestX(); selfTestChain(); } private: VL_DEBUG_FUNC; // Declare debug() VL_UNCOPYABLE(PartContraction); }; const GraphWay* PartContraction::s_shortestWaywardCpInclusiveWay = NULL; //###################################################################### // DpiImportCallVisitor // Scan node, indicate whether it contains a call to a DPI imported // routine. class DpiImportCallVisitor : public AstNVisitor { private: bool m_hasDpiHazard; // Found a DPI import call. bool m_tracingCall; // Iterating into a CCall to a CFunc // METHODS VL_DEBUG_FUNC; virtual void visit(AstCFunc* nodep) { if (!m_tracingCall) return; m_tracingCall = false; if (nodep->dpiImportWrapper()) { if (nodep->pure() ? !v3Global.opt.threadsDpiPure() : !v3Global.opt.threadsDpiUnpure()) { m_hasDpiHazard = true; } } iterateChildren(nodep); } virtual void visit(AstCCall* nodep) { iterateChildren(nodep); // Enter the function and trace it m_tracingCall = true; iterate(nodep->funcp()); } virtual void visit(AstNode* nodep) { iterateChildren(nodep); } public: // CONSTUCTORS explicit DpiImportCallVisitor(AstNode* nodep) : m_hasDpiHazard(false) , m_tracingCall(false) { iterate(nodep); } bool hasDpiHazard() const { return m_hasDpiHazard; } virtual ~DpiImportCallVisitor() {} private: VL_UNCOPYABLE(DpiImportCallVisitor); }; //###################################################################### // PartFixDataHazards // Fix data hazards in the partition graph. // // The fine-grained graph from V3Order may contain data hazards which are // not a problem for serial mode, but which would be a problem in parallel // mode. // // There are basically two classes: unordered pairs of writes, and // unordered write-read pairs. We fix both here, with a combination of // MTask-merges and new edges to ensure no such unordered pairs remain. // // ABOUT UNORDERED WRITE-WRITE PAIRS // // The V3Order dependency graph treats these as unordered events: // // a) sig[15:8] = stuff; // ... // b) sig[7:0] = other_stuff; // // Seems OK right? They are writes to disjoint bits of the same // signal. They can run in either order, in serial mode, and the result // will be the same. // // The resulting C code for each of this isn't a pure write, it's // actually an R-M-W sequence: // // a) sig = (sig & 0xff) | (0xff00 & (stuff << 8)); // ... 
// b) sig = (sig & 0xff00) | (0xff & other_stuff); // // In serial mode, order doesn't matter so long as these run serially. // In parallel mode, we must serialize these RMW's to avoid a race. // // We don't actually check here if each write would involve an R-M-W, we // just assume that it would. If this routine ever causes a drastic // increase in critical path, it could be optimized to make a better // prediction (with all the risk that word implies!) about whether a // given write is likely to turn into an R-M-W. // // ABOUT UNORDERED WRITE-READ PAIRS // // If we don't put unordered write-read pairs into some order at verilation // time, we risk a runtime race. // // How do such unordered writer/reader pairs happen? Here's a partial list // of scenarios: // // Case 1: Circular logic // // If the design has circular logic, V3Order has by now generated some // dependency cycles, and also cut some of the edges to make it // acyclic. // // For serial mode, that was fine. We can break logic circles at an // arbitrary point. At runtime, we'll repeat the _eval() until no // changes are detected, which papers over the discarded dependency. // // For parallel mode, this situation can lead to unordered reads and // writes of the same variable, causing a data race. For example if the // original code is this: // // assign b = b | a << 2; // assign out = b; // // ... there's originally a dependency edge which records that 'b' // depends on the first assign. V3Order may cut this edge, making the // statements unordered. In serial mode that's fine, they can run in // either order. In parallel mode it's a reader/writer race. // // Case 2: Race Condition in Verilog Sources // // If the input has races, eg. blocking assignments in always blocks // that share variables, the graph at this point will contain unordered // writes and reads (or unordered write-write pairs) reflecting that. // // Case 3: Interesting V3Order Behavior // // There's code in V3Order that explicitly avoids making a dependency // edge from a clock-gater signal to the logic node that produces the // clock signal. This leads to unordered reader/writer pairs in // parallel mode. // class PartFixDataHazards { private: // TYPES typedef std::set LogicMTaskSet; typedef std::map TasksByRank; typedef std::set OvvSet; typedef vl_unordered_map Olv2MTaskMap; // MEMBERS V3Graph* m_mtasksp; // Mtask graph Olv2MTaskMap m_olv2mtask; // Map OrderLogicVertex to LogicMTask who wraps it unsigned m_mergesDone; // Number of MTasks merged. For stats only. public: // CONSTRUCTORs explicit PartFixDataHazards(V3Graph* mtasksp) : m_mtasksp(mtasksp), m_mergesDone(0) {} // METHODS private: void findAdjacentTasks(OvvSet::iterator ovvIt, TasksByRank* tasksByRankp) { // Find all writer tasks for this variable, group by rank. for (V3GraphEdge* edgep = (*ovvIt)->inBeginp(); edgep; edgep = edgep->inNextp()) { OrderLogicVertex* logicp = dynamic_cast(edgep->fromp()); if (!logicp) continue; if (logicp->domainp()->hasInitial() || logicp->domainp()->hasSettle()) continue; LogicMTask* writerMtaskp = m_olv2mtask.at(logicp); (*tasksByRankp)[writerMtaskp->rank()].insert(writerMtaskp); } // Find all reader tasks for this variable, group by rank. 
        for (V3GraphEdge* edgep = (*ovvIt)->outBeginp();
             edgep; edgep = edgep->outNextp()) {
            // The reader logic is the target of the var's out-edge
            OrderLogicVertex* logicp = dynamic_cast<OrderLogicVertex*>(edgep->top());
            if (!logicp) continue;
            if (logicp->domainp()->hasInitial()
                || logicp->domainp()->hasSettle()) continue;
            LogicMTask* readerMtaskp = m_olv2mtask.at(logicp);
            (*tasksByRankp)[readerMtaskp->rank()].insert(readerMtaskp);
        }
    }
    void mergeSameRankTasks(TasksByRank* tasksByRankp) {
        LogicMTask* lastMergedp = NULL;
        for (TasksByRank::iterator rankIt = tasksByRankp->begin();
             rankIt != tasksByRankp->end(); ++rankIt) {
            // Find the largest node at this rank, merge into it.  (If we
            // happen to find a huge node, this saves time in
            // partMergeEdgesFrom() versus merging into an arbitrary node.)
            LogicMTask* mergedp = NULL;
            for (LogicMTaskSet::iterator it = rankIt->second.begin();
                 it != rankIt->second.end(); ++it) {
                LogicMTask* mtaskp = *it;
                if (mergedp) {
                    if (mergedp->cost() < mtaskp->cost()) mergedp = mtaskp;
                } else {
                    mergedp = mtaskp;
                }
            }
            rankIt->second.erase(mergedp);

            while (!rankIt->second.empty()) {
                LogicMTaskSet::iterator begin = rankIt->second.begin();
                LogicMTask* donorp = *begin;
                if (donorp == mergedp) donorp->v3fatalSrc("Donor can't be merged edge");
                rankIt->second.erase(begin);

                // Merge donorp into mergedp.
                // Fix up the map, so donor's OLVs map to mergedp
                for (LogicMTask::VxList::const_iterator tmvit
                         = donorp->vertexListp()->begin();
                     tmvit != donorp->vertexListp()->end(); ++tmvit) {
                    MTaskMoveVertex* tmvp = *tmvit;
                    OrderLogicVertex* logicp = tmvp->logicp();
                    if (logicp) m_olv2mtask[logicp] = mergedp;
                }

                // Move all vertices from donorp to mergedp
                mergedp->moveAllVerticesFrom(donorp);

                // Move edges from donorp to recipientp
                partMergeEdgesFrom(m_mtasksp, mergedp, donorp, NULL);

                // Remove donorp from the graph
                donorp->unlinkDelete(m_mtasksp); VL_DANGLING(donorp);
                m_mergesDone++;
            }

            if (lastMergedp) {
                if (lastMergedp->rank() >= mergedp->rank()) {
                    mergedp->v3fatalSrc("Merging must be on lower rank");
                }
                if (!lastMergedp->hasRelative(GraphWay::FORWARD, mergedp)) {
                    new MTaskEdge(m_mtasksp, lastMergedp, mergedp, 1);
                }
            }
            lastMergedp = mergedp;
        }
    }
    bool hasDpiHazard(LogicMTask* mtaskp) {
        for (LogicMTask::VxList::const_iterator it = mtaskp->vertexListp()->begin();
             it != mtaskp->vertexListp()->end(); ++it) {
            if (!(*it)->logicp()) continue;
            AstNode* nodep = (*it)->logicp()->nodep();
            // NOTE: We don't handle DPI exports. If testbench code calls a
            // DPI-exported function at any time during eval() we may have
            // a data hazard. (Likewise in non-threaded mode if an export
            // messes with an ordered variable we're broken.)

            // Find all calls to DPI-imported functions, we can put those
            // into a serial order at least. That should solve the most
            // likely DPI-related data hazards.
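            //
            // Illustrative sketch (hypothetical design, names invented):
            // if two different mtasks both contain logic calling an import
            // such as
            //     import "DPI-C" function int monitor_step(int v);
            // that is not marked pure, then unless threadsDpiUnpure() is
            // set, both mtasks are flagged here and later serialized
            // (merged, or ordered with new edges) by the DPI handling in
            // go() below.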
            if (DpiImportCallVisitor(nodep).hasDpiHazard()) { return true; }
        }
        return false;
    }

public:
    void go() {
        vluint64_t startUsecs = 0;
        if (debug() >= 3) startUsecs = V3Os::timeUsecs();

        // Build an OLV->mtask map and a set of OVVs
        OrderByPtrId ovvOrder;
        OvvSet ovvSet(ovvOrder);
        // OVV's which wrap systemC vars will be handled slightly specially
        OvvSet ovvSetSystemC(ovvOrder);
        for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp();
             vxp; vxp = vxp->verticesNextp()) {
            LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(vxp);
            // Should be only one MTaskMoveVertex in each mtask at this
            // stage, but whatever, write it as a loop:
            for (LogicMTask::VxList::const_iterator it = mtaskp->vertexListp()->begin();
                 it != mtaskp->vertexListp()->end(); ++it) {
                MTaskMoveVertex* tmvp = *it;
                if (OrderLogicVertex* logicp = tmvp->logicp()) {
                    m_olv2mtask[logicp] = mtaskp;
                    // Look at downstream vars.
                    for (V3GraphEdge* edgep = logicp->outBeginp();
                         edgep; edgep = edgep->outNextp()) {
                        // Only consider OrderVarStdVertex which reflects
                        // an actual lvalue assignment; the others do not.
                        OrderVarStdVertex* ovvp
                            = dynamic_cast<OrderVarStdVertex*>(edgep->top());
                        if (!ovvp) continue;
                        if (ovvp->varScp()->varp()->isSc()) {
                            ovvSetSystemC.insert(ovvp);
                        } else {
                            ovvSet.insert(ovvp);
                        }
                    }
                }
            }
        }

        // Rank the graph.
        // DGS is faster than V3GraphAlg's recursive rank, in the worst
        // cases where the recursive rank must pass through the same node
        // many times. (We saw 22s for DGS vs. 500s for recursive rank on
        // one large design.)
        {
            GraphStreamUnordered serialize(m_mtasksp);
            const V3GraphVertex* vertexp;
            while ((vertexp = serialize.nextp())) {
                uint32_t rank = 0;
                for (V3GraphEdge* edgep = vertexp->inBeginp();
                     edgep; edgep = edgep->inNextp()) {
                    rank = std::max(edgep->fromp()->rank() + 1, rank);
                }
                const_cast<V3GraphVertex*>(vertexp)->rank(rank);
            }
        }

        // For each OrderVarVertex, look at its writer and reader mtasks.
        //
        // If there's a set of writers and readers at the same rank, we
        // know these are unordered with respect to one another, so merge
        // those mtasks all together.
        //
        // At this point, we have at most one merged mtask per rank (for a
        // given OVV.) Create edges across these remaining mtasks to ensure
        // they run in serial order (going along with the existing ranks.)
        //
        // NOTE: we don't update the CP's stored in the LogicMTasks to
        // reflect the changes we make to the graph. That's OK, as we
        // haven't yet initialized CPs when we call this routine.
        for (OvvSet::iterator ovvit = ovvSet.begin(); ovvit != ovvSet.end(); ++ovvit) {
            // Build a set of mtasks, per rank, which access this var.
            // Within a rank, sort by MTaskID to avoid nondeterminism.
            TasksByRank tasksByRank;

            // Find all reader and writer tasks for this variable, add to
            // tasksByRank.
            findAdjacentTasks(ovvit, &tasksByRank);

            // Merge all writer and reader tasks from same rank together.
            //
            // NOTE: Strictly speaking, we don't need to merge all the
            // readers together. That may lead to extra serialization. The
            // least amount of ordering we could impose here would be to
            // merge all writers at a given rank together; then make edges
            // from the merged writer node to each reader node at the same
            // rank; and then from each reader node to the merged writer at
            // the next rank.
            //
            // Whereas, merging all readers and writers at the same rank
            // together is "the simplest thing that could possibly work"
            // and it seems to. It also creates fairly few edges. We don't
            // want to create tons of edges here, doing so is not nice to
            // the main edge contraction pass.
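            //
            // Hypothetical worked example (variable and mtask names
            // invented for illustration): suppose var V is written by
            // mtasks A and B at rank 3, and read by mtask C at rank 3 and
            // mtask D at rank 5. The rank-3 accessors {A, B, C} get merged
            // into one mtask (the highest-cost of the three), and an edge
            // is added from that merged mtask to D, so the rank-5 reader
            // runs strictly after all the rank-3 accesses.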
            mergeSameRankTasks(&tasksByRank);
        }

        // Handle SystemC vars just a little differently. Instead of
        // treating each var as an independent entity, and serializing
        // writes to that one var, we treat ALL systemC vars as a single
        // entity and serialize writes (and, conservatively, reads) across
        // all of them.
        //
        // Reasoning: writing a systemC var actually turns into a call to a
        // var.write() method, which under the hood is accessing some data
        // structure that's shared by many SC vars. It's not thread safe.
        //
        // Hopefully we only have a few SC vars -- top level ports, probably.
        {
            TasksByRank tasksByRank;
            for (OvvSet::iterator ovvit = ovvSetSystemC.begin();
                 ovvit != ovvSetSystemC.end(); ++ovvit) {
                findAdjacentTasks(ovvit, &tasksByRank);
            }
            mergeSameRankTasks(&tasksByRank);
        }

        // Handle nodes containing DPI calls, we want to serialize those
        // by default unless user gave --threads-dpi-concurrent.
        // Same basic strategy as above to serialize access to SC vars.
        if (!v3Global.opt.threadsDpiPure() || !v3Global.opt.threadsDpiUnpure()) {
            TasksByRank tasksByRank;
            for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp();
                 vxp; vxp = vxp->verticesNextp()) {
                LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(vxp);
                if (hasDpiHazard(mtaskp)) {
                    tasksByRank[vxp->rank()].insert(mtaskp);
                }
            }
            mergeSameRankTasks(&tasksByRank);
        }

        UINFO(4, "PartFixDataHazards() merged "<<m_mergesDone
              <<" pairs of mtasks in "<<(V3Os::timeUsecs() - startUsecs)
              <<" usecs\n");
    }

private:
    VL_DEBUG_FUNC;  // Declare debug()
    VL_UNCOPYABLE(PartFixDataHazards);
};

//######################################################################
// PartPackMTasks

// Statically pack tasks into threads.
class PartPackMTasks {
private:
    // TYPES
    struct MTaskState {
        uint32_t completionTime;  // Estimated time this mtask will complete
        MTaskState() : completionTime(0) {}
    };
    // Sort ready mtasks by ID, for deterministic results
    struct MTaskCmp {
        bool operator()(const ExecMTask* ap, const ExecMTask* bp) const {
            return ap->id() < bp->id();
        }
    };

    // MEMBERS
    V3Graph* m_mtasksp;  // Mtask graph
    uint32_t m_nThreads;  // Number of threads
    uint32_t m_sandbagNumerator;  // Numerator padding for est runtime
    uint32_t m_sandbagDenom;  // Denominator padding for est runtime

    typedef vl_unordered_map<const ExecMTask*, MTaskState> MTaskStateMap;
    MTaskStateMap m_mtaskState;  // State for each mtask.

    MTaskCmp m_mtaskCmp;  // Comparison functor
    typedef std::set<ExecMTask*, MTaskCmp> ReadyMTasks;
    ReadyMTasks m_ready;  // MTasks ready to be assigned next; all their
    //                    // dependencies are already assigned.

    typedef std::vector<ExecMTask*> MTaskVec;
    MTaskVec m_prevMTask;  // Previous mtask scheduled to each thread.
    std::vector<uint32_t> m_busyUntil;  // Time each thread is occupied until

public:
    // CONSTRUCTORS
    PartPackMTasks(V3Graph* mtasksp,
                   uint32_t nThreads = v3Global.opt.threads(),
                   unsigned sandbagNumerator = 30,
                   unsigned sandbagDenom = 100)
        : m_mtasksp(mtasksp)
        , m_nThreads(nThreads)
        , m_sandbagNumerator(sandbagNumerator)
        , m_sandbagDenom(sandbagDenom)
        , m_ready(m_mtaskCmp) {}
    ~PartPackMTasks() {}

    // METHODS
    uint32_t completionTime(const ExecMTask* mtaskp, uint32_t thread) {
        const MTaskState& state = m_mtaskState[mtaskp];
        UASSERT(mtaskp->thread() != 0xffffffff, "Mtask should have assigned thread");
        if (thread == mtaskp->thread()) {
            // No overhead on native thread
            return state.completionTime;
        }
        // Add some padding to the estimated runtime when looking from
        // another thread
        uint32_t sandbaggedEndTime = state.completionTime
            + (m_sandbagNumerator * mtaskp->cost()) / m_sandbagDenom;

        // If task B is packed after task A on thread 0, don't let thread 1
        // think that A finishes later than thread 0 thinks that B
        // finishes, otherwise we get priority inversions and fail the self
        // test.
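        //
        // Worked example, using the numbers from selfTest() below
        // (sandbag numerator/denom = 3/10): t0 costs 1000 and completes at
        // 1000 on its own thread 0. Seen from thread 1, the padded
        // estimate would be 1000 + (3*1000)/10 = 1300; but t1 is packed
        // after t0 on thread 0 and completes at 1100, so the estimate is
        // clamped to 1100 - 1 = 1099, which is the completionTime(t0, 1)
        // value the self test expects.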
        if (mtaskp->packNextp()) {
            uint32_t successorEndTime
                = completionTime(mtaskp->packNextp(), mtaskp->thread());
            if ((sandbaggedEndTime >= successorEndTime) && (successorEndTime > 1)) {
                sandbaggedEndTime = successorEndTime - 1;
            }
        }

        UINFO(6, "Sandbagged end time for "<<mtaskp->name()<<" on th "<<thread
              <<" = "<<sandbaggedEndTime<<endl);
        return sandbaggedEndTime;
    }
    void setCompletionTime(const ExecMTask* mtaskp, uint32_t time) {
        // Record the estimated completion time for an mtask
        m_mtaskState[mtaskp].completionTime = time;
    }
    void go() {
        // Build the initial ready list: mtasks with no incoming edges have
        // no dependencies, so they may be assigned first.
        for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp();
             vxp; vxp = vxp->verticesNextp()) {
            ExecMTask* mtaskp = dynamic_cast<ExecMTask*>(vxp);
            if (vxp->inEmpty()) m_ready.insert(mtaskp);
        }
        m_prevMTask.clear();
        m_prevMTask.resize(m_nThreads);
        m_busyUntil.clear();
        m_busyUntil.resize(m_nThreads);

        while (!m_ready.empty()) {
            // For each task in the ready set, compute when it might start
            // on each thread (in that thread's local time frame.)
            uint32_t bestTime = 0xffffffff;
            uint32_t bestTh = 0;
            ExecMTask* bestMtaskp = NULL;
            for (uint32_t th = 0; th < m_nThreads; ++th) {
                for (ReadyMTasks::iterator taskIt = m_ready.begin();
                     taskIt != m_ready.end(); ++taskIt) {
                    uint32_t timeBegin = m_busyUntil[th];
                    if (timeBegin > bestTime) {
                        UINFO(6, "th "<<th<<" busy until "<<timeBegin
                              <<", beyond bestTime "<<bestTime
                              <<", stop considering this thread\n");
                        break;
                    }
                    ExecMTask* taskp = *taskIt;
                    for (V3GraphEdge* edgep = taskp->inBeginp();
                         edgep; edgep = edgep->inNextp()) {
                        ExecMTask* priorp = dynamic_cast<ExecMTask*>(edgep->fromp());
                        uint32_t priorEndTime = completionTime(priorp, th);
                        if (priorEndTime > timeBegin) timeBegin = priorEndTime;
                    }
                    UINFO(6, "Task "<<taskp->name()<<" start at "<<timeBegin
                          <<" on thread "<<th<<endl);
                    if ((timeBegin < bestTime)
                        || ((timeBegin == bestTime) && bestMtaskp
                            && (taskp->priority() > bestMtaskp->priority()))) {
                        bestTime = timeBegin;
                        bestTh = th;
                        bestMtaskp = taskp;
                    }
                }
            }

            UINFO(6, "Will schedule "<<bestMtaskp->name()
                  <<" onto thread "<<bestTh<<endl);

            // Schedule bestMtaskp on bestTh, ending at bestEndTime
            uint32_t bestEndTime = bestTime + bestMtaskp->cost();
            setCompletionTime(bestMtaskp, bestEndTime);

            // Update the ready list
            size_t erased = m_ready.erase(bestMtaskp);
            if (erased <= 0) bestMtaskp->v3fatalSrc("Should have erased something?");
            for (V3GraphEdge* edgeOutp = bestMtaskp->outBeginp();
                 edgeOutp; edgeOutp = edgeOutp->outNextp()) {
                ExecMTask* nextp = dynamic_cast<ExecMTask*>(edgeOutp->top());
                UASSERT(nextp->thread() == 0xffffffff,
                        "Tasks after one being assigned should not be assigned yet");
                // They also should not be ready yet, since they only now
                // may become ready
                if (m_ready.find(nextp) != m_ready.end()) {
                    nextp->v3fatalSrc("Tasks after one being assigned should not be ready");
                }
                bool isReady = true;
                for (V3GraphEdge* edgeInp = nextp->inBeginp();
                     edgeInp; edgeInp = edgeInp->inNextp()) {
                    ExecMTask* priorp = dynamic_cast<ExecMTask*>(edgeInp->fromp());
                    if (priorp == bestMtaskp) continue;
                    if (priorp->thread() == 0xffffffff) {
                        // This prior is not assigned yet
                        isReady = false;
                    }
                }
                if (isReady) {
                    m_ready.insert(nextp);
                    UINFO(6, "Inserted "<<nextp->name()<<" into ready\n");
                }
            }

            // Update the ExecMTask itself
            if (m_prevMTask[bestTh]) {
                m_prevMTask[bestTh]->packNextp(bestMtaskp);
                UINFO(6, "Packing "<<bestMtaskp->name()
                      <<" after "<<m_prevMTask[bestTh]->name()<<endl);
            } else {
                UINFO(6, "Marking "<<bestMtaskp->name()<<" as thread root\n");
                bestMtaskp->threadRoot(true);
            }
            bestMtaskp->thread(bestTh);

            // Update the thread state
            m_prevMTask[bestTh] = bestMtaskp;
            m_busyUntil[bestTh] = bestEndTime;
        }
    }

    // SELF TEST
    static void selfTest() {
        V3Graph graph;
        ExecMTask* t0 = new ExecMTask(&graph, NULL, 0);
        t0->cost(1000);
        t0->priority(1100);
        ExecMTask* t1 = new ExecMTask(&graph, NULL, 1);
        t1->cost(100);
        t1->priority(100);
        ExecMTask* t2 = new ExecMTask(&graph, NULL, 2);
        t2->cost(100);
        t2->priority(100);

        new V3GraphEdge(&graph, t0, t1, 1);
        new V3GraphEdge(&graph, t0, t2, 1);

        PartPackMTasks packer(&graph,
                              2,    // Threads
                              3,    // Sandbag numerator
                              10);  // Sandbag denom
        packer.go();

        UASSERT_SELFTEST(bool, t0->threadRoot(), true);
        UASSERT_SELFTEST(uint32_t, t0->thread(), 0);
        UASSERT_SELFTEST(const void*, t0->packNextp(), t1);
        UASSERT_SELFTEST(uint32_t, t1->thread(), 0);
        UASSERT_SELFTEST(bool, t1->threadRoot(), false);
        UASSERT_SELFTEST(const void*, t1->packNextp(), NULL);
        UASSERT_SELFTEST(uint32_t, t2->thread(), 1);
UASSERT_SELFTEST(bool, t2->threadRoot(), true); UASSERT_SELFTEST(const void*, t2->packNextp(), NULL); // On its native thread, we see the actual end time for t0: UASSERT_SELFTEST(uint32_t, packer.completionTime(t0, 0), 1000); // On the other thread, we see a sandbagged end time which does not // exceed the t1 end time: UASSERT_SELFTEST(uint32_t, packer.completionTime(t0, 1), 1099); // Actual end time on native thread: UASSERT_SELFTEST(uint32_t, packer.completionTime(t1, 0), 1100); // Sandbagged end time seen on thread 1. Note it does not compound // with t0's sandbagged time; compounding caused trouble in // practice. UASSERT_SELFTEST(uint32_t, packer.completionTime(t1, 1), 1130); UASSERT_SELFTEST(uint32_t, packer.completionTime(t2, 0), 1229); UASSERT_SELFTEST(uint32_t, packer.completionTime(t2, 1), 1199); } private: VL_DEBUG_FUNC; // Declare debug() VL_UNCOPYABLE(PartPackMTasks); }; //###################################################################### // V3Partition implementation void V3Partition::debugMTaskGraphStats(const V3Graph* graphp, const string& stage) { if (!debug()) return; UINFO(4, "\n"); UINFO(4, " Stats for "<verticesBeginp(); mtaskp; mtaskp = mtaskp->verticesNextp()) { ++mtaskCount; uint32_t mtaskCost = dynamic_cast(mtaskp)->cost(); totalCost += mtaskCost; unsigned log2Cost = 0; while (mtaskCost >>= 1) ++log2Cost; UASSERT(log2Cost < 32, "log2Cost overflow in debugMTaskGraphStats"); ++mtaskCostHist[log2Cost]; } UINFO(4, " Total mtask cost = "<= 4) graphp->dumpDotFilePrefixedAlways(filePrefix); } // Look only at the cost of each mtask, neglect communication cost. // This will show us how much parallelism we expect, assuming cache-miss // costs are minor and the cost of running logic is the dominant cost. PartParallelismEst vertexParEst(graphp); vertexParEst.traverse(); vertexParEst.statsReport(stage); if (debug()>=4) { UINFO(0, "\n"); UINFO(0, " Parallelism estimate for based on mtask costs:\n"); vertexParEst.debugReport(); } } // Print a hash of the shape of graphp. If you are battling // nondeterminism, this can help to pinpoint where in the pipeline it's // creeping in. void V3Partition::hashGraphDebug(const V3Graph* graphp, const char* debugName) { // Disabled when there are no nondeterminism issues in flight. if (!v3Global.opt.debugNondeterminism()) return; vl_unordered_map vx2Id; unsigned id = 0; for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { vx2Id[vxp] = id++; } unsigned hash = 0; for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { for (const V3GraphEdge* edgep = vxp->outBeginp(); edgep; edgep= edgep->outNextp()) { const V3GraphVertex* top = edgep->top(); hash = vx2Id[top] + 31u * hash; // The K&R hash function } } UINFO(0, "Hash of shape (not contents) of "<(outp->top()); UASSERT(top, "MoveVertex not associated to mtask"); Vx2MTaskMap::const_iterator it = vx2mtaskp->find(top); UASSERT(it != vx2mtaskp->end(), "MTask map can't find id"); LogicMTask* otherMTaskp = it->second; UASSERT(otherMTaskp, "NULL other Mtask"); if (otherMTaskp == mtaskp) mtaskp->v3fatalSrc("Would create a cycle edge"); // Don't create redundant edges. if (mtaskp->hasRelative(GraphWay::FORWARD, otherMTaskp)) { continue; } new MTaskEdge(mtasksp, mtaskp, otherMTaskp, 1); } } } } void V3Partition::go(V3Graph* mtasksp) { // Called by V3Order hashGraphDebug(m_fineDepsGraphp, "v3partition initial fine-grained deps"); // Create the first MTasks. Initially, each MTask just wraps one // MTaskMoveVertex. 
Over time, we'll merge MTasks together and // eventually each MTask will wrap a large number of MTaskMoveVertices // (and the logic nodes therein.) uint32_t totalGraphCost = 0; { // The V3InstrCount within LogicMTask will set user5 on each AST // node, to assert that we never count any node twice. AstUser5InUse inUser5; Vx2MTaskMap vx2mtask; for (V3GraphVertex* vxp = m_fineDepsGraphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { MTaskMoveVertex* mtmvVxp = dynamic_cast(vxp); if (!mtmvVxp) vxp->v3fatalSrc("Every vertex here should be an MTaskMoveVertex"); LogicMTask* mtaskp = new LogicMTask(mtasksp, mtmvVxp); vx2mtask[mtmvVxp] = mtaskp; totalGraphCost += mtaskp->cost(); } // Create the mtask->mtask dep edges based on vertex deps setupMTaskDeps(mtasksp, &vx2mtask); } V3Partition::debugMTaskGraphStats(mtasksp, "initial"); // For debug: print out the longest critical path. This allows us to // verify that the costs look reasonable, that we aren't combining // nodes that should probably be split, etc. if (v3Global.opt.dumpTreeLevel(__FILE__) >= 3) { LogicMTask::dumpCpFilePrefixed(mtasksp, "cp"); } // Merge nodes that could present data hazards; see comment within. { PartFixDataHazards(mtasksp).go(); V3Partition::debugMTaskGraphStats(mtasksp, "hazards"); hashGraphDebug(mtasksp, "mtasksp after fixDataHazards()"); } // Setup the critical path into and out of each node. partInitCriticalPaths(mtasksp); hashGraphDebug(mtasksp, "after partInitCriticalPaths()"); // Order the graph. We know it's already ranked from fixDataHazards() // so we don't need to rank it again. // // On at least some models, ordering the graph here seems to help // performance. (Why? Is it just triggering noise in a lucky direction? // Is it just as likely to harm results?) // // More diversity of models that can build with --threads will // eventually tell us. For now keep the order() so we don't forget // about it, in case it actually helps. TODO: get more data and maybe // remove this later if it doesn't really help. mtasksp->orderPreRanked(); int targetParFactor = v3Global.opt.threads(); if (targetParFactor < 2) { v3fatalSrc("We should not reach V3Partition when --threads <= 1"); } // Set cpLimit to roughly totalGraphCost / nThreads // // Actually set it a bit lower, by a hardcoded fudge factor. This // results in more smaller mtasks, which helps reduce fragmentation // when scheduling them. unsigned fudgeNumerator = 3; unsigned fudgeDenominator = 5; uint32_t cpLimit = ((totalGraphCost * fudgeNumerator) / (targetParFactor * fudgeDenominator)); UINFO(4, "V3Partition set cpLimit = "<removeTransitiveEdges(); V3Partition::debugMTaskGraphStats(mtasksp, "transitive1"); } // Reassign MTask IDs onto smaller numbers, which should be more stable // across small logic changes. Keep MTask IDs in the same relative // order though, otherwise we break CmpLogicMTask for still-existing // EdgeSet's that haven't destructed yet. { typedef std::set SortedMTaskSet; SortedMTaskSet sorted; for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) { LogicMTask* mtaskp = dynamic_cast(itp); sorted.insert(mtaskp); } uint32_t nextId = 1; for (SortedMTaskSet::iterator it = sorted.begin(); it != sorted.end(); ++it) { // We shouldn't perturb the sort order of the set, despite // changing the IDs, they should all just remain in the same // relative order. 
Confirm that: UASSERT(nextId <= (*it)->id(), "Should only shrink MTaskIDs here"); UINFO(4, "Reassigning MTask id " << (*it)->id() << " to id " << nextId << "\n"); (*it)->id(nextId); nextId++; } } // Set color to indicate an mtaskId on every underlying MTaskMoveVertex. for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) { LogicMTask* mtaskp = dynamic_cast(itp); for (LogicMTask::VxList::const_iterator it = mtaskp->vertexListp()->begin(); it != mtaskp->vertexListp()->end(); ++it) { MTaskMoveVertex* mvertexp = *it; mvertexp->color(mtaskp->id()); } } } void V3Partition::finalizeCosts(V3Graph* execMTaskGraphp) { GraphStreamUnordered ser(execMTaskGraphp, GraphWay::REVERSE); while (const V3GraphVertex* vxp = ser.nextp()) { ExecMTask* mtp = dynamic_cast(const_cast(vxp)); uint32_t costCount = V3InstrCount::count(mtp->bodyp(), false); mtp->cost(costCount); mtp->priority(costCount); // "Priority" is the critical path from the start of the mtask, to // the end of the graph reachable from this mtask. Given the // choice among several ready mtasks, we'll want to start the // highest priority one first, so we're always working on the "long // pole" for (V3GraphEdge* edgep = mtp->outBeginp(); edgep; edgep = edgep->outNextp()) { ExecMTask* followp = dynamic_cast(edgep->top()); if ((followp->priority() + mtp->cost()) > mtp->priority()) { mtp->priority(followp->priority() + mtp->cost()); } } } // Some MTasks may now have zero cost, eliminate those. // (It's common for tasks to shrink to nothing when V3LifePost // removes dly assignments.) for (V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp; ) { ExecMTask* mtp = dynamic_cast(vxp); vxp = vxp->verticesNextp(); // Advance before delete // Don't rely on checking mtp->cost() == 0 to detect an empty task. // Our cost-estimating logic is just an estimate. Instead, check // the MTaskBody to see if it's empty. That's the source of truth. AstMTaskBody* bodyp = mtp->bodyp(); if (!bodyp->stmtsp()) { // Kill this empty mtask UINFO(6, "Removing zero-cost "<name()<inBeginp(); inp; inp = inp->inNextp()) { for (V3GraphEdge* outp = mtp->outBeginp(); outp; outp = outp->outNextp()) { new V3GraphEdge(execMTaskGraphp, inp->fromp(), outp->top(), 1); } } mtp->unlinkDelete(execMTaskGraphp); VL_DANGLING(mtp); // Also remove and delete the AstMTaskBody, otherwise it would // keep a dangling pointer to the ExecMTask. bodyp->unlinkFrBack()->deleteTree(); VL_DANGLING(bodyp); } } // Removing tasks may cause edges that were formerly non-transitive to // become transitive. Also we just created new edges around the removed // tasks, which could be transitive. Prune out all transitive edges. { execMTaskGraphp->removeTransitiveEdges(); V3Partition::debugMTaskGraphStats(execMTaskGraphp, "transitive2"); } // Record summary stats for final m_tasks graph. // (More verbose stats are available with --debugi-V3Partition >= 3.) PartParallelismEst parEst(execMTaskGraphp); parEst.traverse(); parEst.statsReport("final"); if (debug() >= 3) { UINFO(0," Final mtask parallelism report:\n"); parEst.debugReport(); } } void V3Partition::finalize() { // Called by Verilator top stage AstExecGraph* execGraphp = v3Global.rootp()->execGraphp(); UASSERT(execGraphp, "Couldn't find AstExecGraph singleton."); // Back in V3Order, we partitioned mtasks using provisional cost // estimates. However, V3Order precedes some optimizations (notably // V3LifePost) that can change the cost of logic within each mtask. 
    // Now that logic is final, recompute the cost and priority of each
    // ExecMTask.
    finalizeCosts(execGraphp->mutableDepGraphp());

    // "Pack" the mtasks: statically associate each mtask with a thread,
    // and determine the order in which each thread will run its mtasks.
    PartPackMTasks(execGraphp->mutableDepGraphp()).go();
}

void V3Partition::selfTest() {
    PartPropagateCpSelfTest::selfTest();
    PartPackMTasks::selfTest();
    PartContraction::selfTest();
}