// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
// DESCRIPTION: Verilator: Threading's logic to mtask partitioner
//
// Code available from: https://verilator.org
//
//*************************************************************************
//
// Copyright 2003-2021 by Wilson Snyder. This program is free software; you
// can redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//*************************************************************************

#include "config_build.h"
#include "verilatedos.h"

#include "V3EmitCBase.h"
#include "V3Os.h"
#include "V3File.h"
#include "V3GraphAlg.h"
#include "V3GraphStream.h"
#include "V3InstrCount.h"
#include "V3Partition.h"
#include "V3PartitionGraph.h"
#include "V3Scoreboard.h"
#include "V3Stats.h"

#include <list>
#include <memory>
#include <unordered_set>

class MergeCandidate;

//######################################################################
// Partitioner tunable settings:
//
// Before describing these settings, a bit of background:
//
// Early during the development of the partitioner, V3Split was failing to
// split large always blocks (with ~100K assignments) so we had to handle
// very large vertices with ~100K incoming and outgoing edges.
//
// The partitioner attempts to deal with such densely connected
// graphs. Some of the tuning parameters below reference "huge vertices";
// that means vertices with tens of thousands of edges in and out, whereas
// in most graphs the typical vertex has only tens of edges in and out.
//
// V3Split has since been fixed to more reliably split large always
// blocks. It's kind of an open question whether the partitioner must
// handle huge nodes gracefully. Maybe not! But it still can, given
// appropriate tuning.

// PART_SIBLING_EDGE_LIMIT (integer)
//
// Arbitrarily limit the number of edges on a single vertex that will be
// considered when enumerating siblings, to the given value. This protects
// the partitioner runtime in the presence of huge vertices.
//
// The sibling-merge is less important than the edge merge. (You can
// totally disable the sibling merge and get halfway decent partitions; you
// can't disable edge merges, those are fundamental to the process.) So,
// skipping the enumeration of some siblings on a few vertices does not
// have a large impact on the result of the partitioner.
//
// If your vertices are small, the limit (at 25) approaches a no-op. Hence
// there's basically no cost to applying this limit even when we don't
// expect huge vertices.
//
// If you don't care about partitioner runtime and you want the most
// aggressive partition, set the limit very high. If you have huge
// vertices, leave this as is.
constexpr unsigned PART_SIBLING_EDGE_LIMIT = 25;

// PART_STEPPED_COST (boolean)
//
// When computing critical path costs, use a step function on the actual
// underlying vertex cost.
//
// If there are huge vertices, when a tiny vertex merges into a huge
// vertex, we can often avoid increasing the huge vertex's stepped cost.
// If the stepped cost hasn't increased, and the critical path into the huge
// vertex hasn't increased, we can avoid propagating a new critical path to
// vertices past the huge vertex. Since huge vertices tend to have huge lists
// of children and parents, this can be a substantial savings.
//
// Does not seem to reduce the quality of the partitioner's output.
//
// If you have huge vertices, leave this 'true'; it is the major setting
// that allows the partitioner to handle such difficult graphs on anything
// like a human time scale.
//
// If you don't have huge vertices, the 'true' value doesn't help much but
// should cost almost nothing in terms of partitioner quality.
//
// If you want the most aggressive possible partition, set it 'false' and
// be prepared to be disappointed when the improvement in the partition is
// negligible / in the noise.
//
// Q) Why retain the control, if there is really no downside?
//
// A) Cost stepping can lead to corner cases. A developer may wish to
//    disable cost stepping to rule it out as the cause of unexpected
//    behavior.
#define PART_STEPPED_COST true

// Don't produce more than a certain maximum number of MTasks. This helps
// the TSP variable sort not to blow up (a concern for some of the tests)
// and we probably don't want a huge number of mtasks in practice anyway
// (50 to 100 is typical).
//
// If the user doesn't give one with '--threads-max-mtasks', we'll set the
// maximum # of MTasks to
//   (# of threads * PART_DEFAULT_MAX_MTASKS_PER_THREAD)
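// For example, '--threads 4' yields a default cap of 4 * 50 = 200 mtasks.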
constexpr unsigned PART_DEFAULT_MAX_MTASKS_PER_THREAD = 50;

// end tunables.

//######################################################################
// Misc graph and assertion utilities

static void partCheckCachedScoreVsActual(uint32_t cached, uint32_t actual) {
#if PART_STEPPED_COST
    // Cached CP might be a little bigger than actual, due to stepped CPs.
    // Example:
    // Let's say we have a parent with stepped_cost 40 and a grandparent
    // with stepped_cost 27. Our forward-cp is 67. Then our parent and
    // grandparent get merged, the merged node has stepped cost 66. We
    // won't propagate that new CP to children as it hasn't grown. So,
    // children may continue to think that the CP coming through this path
    // is a little higher than it really is; permit that.
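    // (Checking the assert below with those numbers: cached=67, actual=66
    //  gives 67*10=670 <= 66*11=726 and 67*11=737 >= 66*10=660, so a
    //  cached value within ~10% of actual passes.)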
    UASSERT(((cached * 10) <= (actual * 11)) && ((cached * 11) >= (actual * 10)),
            "Calculation error in scoring (approximate, may need tweak)");
#else
    UASSERT(cached == actual, "Calculation error in scoring");
#endif
}

//######################################################################
// PartPropagateCp

// Propagate increasing critical path (CP) costs through a graph.
//
// Usage:
//  * Client increases the cost and/or CP at a node or small set of nodes
//    (often a pair in practice, eg. edge contraction.)
//  * Client instances a PartPropagateCp object
//  * Client calls PartPropagateCp::cpHasIncreased() one or more times.
//    Each call indicates that the inclusive CP of some "seed" vertex
//    has increased to a given value.
//    * NOTE: PartPropagateCp will neither read nor modify the cost
//      or CPs at the seed vertices, it only accesses and modifies
//      vertices wayward from the seeds.
//  * Client calls PartPropagateCp::go(). Internally, this iteratively
//    propagates the new CPs wayward through the graph.
//
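// A minimal usage sketch (mirroring the self-test below; 'MyAccessor' is a
// placeholder for any type implementing the T_CostAccessor methods this
// class calls):
//
//     PartPropagateCp<MyAccessor> prop(graphp, GraphWay::FORWARD, &accessor,
//                                      /*slowAsserts=*/false);
//     prop.cpHasIncreased(seedVxp, newInclusiveCp);  // one or more seeds
//     prop.go();
//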
template <class T_CostAccessor> class PartPropagateCp : GraphAlg<> {
private:
    // MEMBERS
    GraphWay m_way;  // CPs oriented in this direction: either FORWARD
    //               // from graph-start to current node, or REVERSE
    //               // from graph-end to current node.
    T_CostAccessor* m_accessp;  // Access cost and CPs on V3GraphVertex's.
    vluint64_t m_generation = 0;  // Mark each vertex with this number;
    //                            // confirm we only process each vertex once.
    bool m_slowAsserts;  // Enable nontrivial asserts
    SortByValueMap<V3GraphVertex*, uint32_t> m_pending;  // Pending rescores

public:
    // CONSTRUCTORS
    PartPropagateCp(V3Graph* graphp, GraphWay way, T_CostAccessor* accessp, bool slowAsserts,
                    V3EdgeFuncP edgeFuncp = &V3GraphEdge::followAlwaysTrue)
        : GraphAlg<>{graphp, edgeFuncp}
        , m_way{way}
        , m_accessp{accessp}
        , m_slowAsserts{slowAsserts} {}

    // METHODS
    void cpHasIncreased(V3GraphVertex* vxp, uint32_t newInclusiveCp) {
        // For *vxp, whose CP-inclusive has just increased to
        // newInclusiveCp, iterate to all wayward nodes, update the edges
        // of each, and add each to m_pending if its overall CP has grown.
        for (V3GraphEdge* edgep = vxp->beginp(m_way); edgep; edgep = edgep->nextp(m_way)) {
            if (!m_edgeFuncp(edgep)) continue;
            V3GraphVertex* relativep = edgep->furtherp(m_way);
            m_accessp->notifyEdgeCp(relativep, m_way, vxp, newInclusiveCp);

            if (m_accessp->critPathCost(relativep, m_way) < newInclusiveCp) {
                // relativep's critPathCost() is out of step with its
                // longest !wayward edge. Schedule that to be resolved.
                uint32_t newPendingVal
                    = newInclusiveCp - m_accessp->critPathCost(relativep, m_way);
                if (m_pending.has(relativep)) {
                    if (newPendingVal > m_pending.at(relativep)) {
                        m_pending.set(relativep, newPendingVal);
                    }
                } else {
                    m_pending.set(relativep, newPendingVal);
                }
            }
        }
    }

    void go() {
        // m_pending maps each pending vertex to the amount that its wayward
        // CP will grow.
        //
        // We can iterate over the pending set in reverse order, always
        // choosing the nodes with the largest pending CP-growth.
        //
        // The intuition is: if the original seed node had its CP grow by
        // 50, the most any wayward node can possibly grow is also 50. So
        // for anything pending to grow by 50, we know we can process it
        // once and we won't have to grow its CP again on the current pass.
        // After we're done with all the grow-by-50s, nothing else will
        // grow by 50 again on the current pass, and we can process the
        // grow-by-49s and we know we'll only have to process each one
        // once. And so on.
        //
        // This generalizes to multiple seed nodes also.
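        //
        // (Illustrative: if m_pending is {A: +50, B: +30} and A feeds B,
        // finalizing A first can only raise B's pending growth, to at most
        // +50; finalizing B before A could force B to be updated twice.)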
        while (!m_pending.empty()) {
            const auto it = m_pending.rbegin();
            V3GraphVertex* updateMep = (*it).key();
            uint32_t cpGrowBy = (*it).value();
            m_pending.erase(it);

            // For *updateMep, whose critPathCost was out-of-date with respect
            // to its edges, update the critPathCost.
            uint32_t startCp = m_accessp->critPathCost(updateMep, m_way);
            uint32_t newCp = startCp + cpGrowBy;
            if (m_slowAsserts) m_accessp->checkNewCpVersusEdges(updateMep, m_way, newCp);

            m_accessp->setCritPathCost(updateMep, m_way, newCp);
            cpHasIncreased(updateMep, newCp + m_accessp->cost(updateMep));
        }
    }

private:
    VL_DEBUG_FUNC;
    VL_UNCOPYABLE(PartPropagateCp);
};

class PartPropagateCpSelfTest final {
private:
    // MEMBERS
    V3Graph m_graph;  // A graph
    V3GraphVertex* m_vx[50];  // All vertices within the graph
    using CpMap = std::unordered_map<V3GraphVertex*, uint32_t>;
    CpMap m_cp;  // Vertex-to-CP map
    CpMap m_seen;  // Set of vertices we've seen

    // CONSTRUCTORS
    PartPropagateCpSelfTest() = default;
    ~PartPropagateCpSelfTest() = default;

    // METHODS
protected:
    friend class PartPropagateCp<PartPropagateCpSelfTest>;
    void notifyEdgeCp(V3GraphVertex* vxp, GraphWay way, V3GraphVertex* throughp,
                      uint32_t cp) const {
        uint32_t throughCost = critPathCost(throughp, way);
        UASSERT_SELFTEST(uint32_t, cp, (1 + throughCost));
    }

private:
    void checkNewCpVersusEdges(V3GraphVertex* vxp, GraphWay way, uint32_t cp) const {
        // Don't need to check this in the self test; it supports an assert
        // that runs in production code.
    }
    void setCritPathCost(V3GraphVertex* vxp, GraphWay way, uint32_t cost) {
        m_cp[vxp] = cost;
        // Confirm that we only set each node's CP once. That's an
        // important property of PartPropagateCp which allows it to be far
        // faster than a recursive algorithm on some graphs.
        const auto it = m_seen.find(vxp);
        UASSERT_OBJ(it == m_seen.end(), vxp, "Set CP on node twice");
        m_seen[vxp] = cost;
    }
    uint32_t critPathCost(V3GraphVertex* vxp, GraphWay way) const {
        const auto it = m_cp.find(vxp);
        if (it != m_cp.end()) return it->second;
        return 0;
    }
    static uint32_t cost(const V3GraphVertex*) { return 1; }
    void partInitCriticalPaths(bool checkOnly) {
        // Set up the FORWARD cp's only. This test only looks in one
        // direction, it assumes REVERSE is symmetrical and would be
        // redundant to test.
        GraphStreamUnordered order(&m_graph);
        while (const V3GraphVertex* cvxp = order.nextp()) {
            V3GraphVertex* vxp = const_cast<V3GraphVertex*>(cvxp);
            uint32_t cpCost = 0;
            for (V3GraphEdge* edgep = vxp->inBeginp(); edgep; edgep = edgep->inNextp()) {
                V3GraphVertex* parentp = edgep->fromp();
                cpCost = std::max(cpCost, critPathCost(parentp, GraphWay::FORWARD) + 1);
            }
            if (checkOnly) {
                UASSERT_SELFTEST(uint32_t, cpCost, critPathCost(vxp, GraphWay::FORWARD));
            } else {
                setCritPathCost(vxp, GraphWay::FORWARD, cpCost);
            }
        }
    }
    void go() {
        // Generate a pseudo-random graph
        std::array<vluint64_t, 2> rngState
            = {{0x12345678ULL, 0x9abcdef0ULL}};  // GCC 3.8.0 wants {{}}
        // Create 50 vertices
        for (auto& i : m_vx) i = new V3GraphVertex(&m_graph);
        // Create 250 edges at random. Edges must go from
        // lower-to-higher index vertices, so we get a DAG.
        for (unsigned i = 0; i < 250; ++i) {
            unsigned idx1 = V3Os::rand64(rngState) % 50;
            unsigned idx2 = V3Os::rand64(rngState) % 50;
            if (idx1 > idx2) {
                new V3GraphEdge(&m_graph, m_vx[idx2], m_vx[idx1], 1);
            } else if (idx2 > idx1) {
                new V3GraphEdge(&m_graph, m_vx[idx1], m_vx[idx2], 1);
            }
        }

        partInitCriticalPaths(false);

        // This SelfTest class is also the T_CostAccessor
        PartPropagateCp<PartPropagateCpSelfTest> prop(&m_graph, GraphWay::FORWARD, this, true);

        // Seed the propagator with every input node;
        // This should result in the complete graph getting all CP's assigned.
        for (const auto& i : m_vx) {
            if (!i->inBeginp()) prop.cpHasIncreased(i, 1 /* inclusive CP starts at 1 */);
        }

        // Run the propagator.
        //  * The setCritPathCost() routine checks that each node's CP changes
        //    at most once.
        //  * The notifyEdgeCp routine is also self checking.
        m_seen.clear();
        prop.go();

        // Finally, confirm that the entire graph appears to have correct CPs.
        partInitCriticalPaths(true);
    }

public:
    static void selfTest() { PartPropagateCpSelfTest().go(); }
};

//######################################################################
// LogicMTask

class LogicMTask final : public AbstractLogicMTask {
public:
    // TYPES
    using VxList = std::list<MTaskMoveVertex*>;

    struct CmpLogicMTask {
        bool operator()(const LogicMTask* ap, const LogicMTask* bp) const {
            return ap->id() < bp->id();
        }
    };

    // This adaptor class allows the PartPropagateCp class to be somewhat
    // independent of the LogicMTask class
    //  - PartPropagateCp can thus be declared before LogicMTask
    //  - PartPropagateCp could be reused with graphs of other node types
    //    in the future, using another Accessor adaptor.
    class CpCostAccessor final {
    public:
        CpCostAccessor() = default;
        ~CpCostAccessor() = default;
        // Return cost of this node
        uint32_t cost(const V3GraphVertex* vxp) const {
            const LogicMTask* mtaskp = dynamic_cast<const LogicMTask*>(vxp);
            return mtaskp->stepCost();
        }
        // Return stored CP to this node
        uint32_t critPathCost(const V3GraphVertex* vxp, GraphWay way) const {
            const LogicMTask* mtaskp = dynamic_cast<const LogicMTask*>(vxp);
            return mtaskp->critPathCost(way);
        }
        // Store a new CP to this node
        void setCritPathCost(V3GraphVertex* vxp, GraphWay way, uint32_t cost) const {
            LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(vxp);
            mtaskp->setCritPathCost(way, cost);
        }
        // Notify vxp that the wayward CP at the throughp-->vxp edge
        // has increased to 'cp'. (vxp is wayward from throughp.)
        // This is our cue to update vxp's m_edges[!way][throughp].
        void notifyEdgeCp(V3GraphVertex* vxp, GraphWay way, V3GraphVertex* throughVxp,
                          uint32_t cp) const {
            LogicMTask* updateVxp = dynamic_cast<LogicMTask*>(vxp);
            LogicMTask* lthroughVxp = dynamic_cast<LogicMTask*>(throughVxp);
            EdgeSet& edges = updateVxp->m_edges[way.invert()];
            uint32_t edgeCp = edges.at(lthroughVxp);
            if (cp > edgeCp) edges.set(lthroughVxp, cp);
        }
        // Check that CP matches that of the longest edge wayward of vxp.
        void checkNewCpVersusEdges(V3GraphVertex* vxp, GraphWay way, uint32_t cp) const {
            LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(vxp);
            EdgeSet& edges = mtaskp->m_edges[way.invert()];
            // This is mtaskp's relative with longest !wayward inclusive CP:
            const auto edgeIt = edges.rbegin();
            uint32_t edgeCp = (*edgeIt).value();
            UASSERT_OBJ(edgeCp == cp, vxp, "CP doesn't match longest wayward edge");
        }

    private:
        VL_UNCOPYABLE(CpCostAccessor);
    };

private:
    // MEMBERS

    // Set of MTaskMoveVertex's assigned to this mtask. LogicMTask does not
    // own the MTaskMoveVertex objects, we merely keep pointers to them
    // here.
    VxList m_vertices;

    // Cost estimate for this LogicMTask, derived from V3InstrCount.
    // In abstract time units.
    uint32_t m_cost = 0;

    // Cost of critical paths going FORWARD from graph-start to the start
    // of this vertex, and also going REVERSE from the end of the graph to
    // the end of the vertex. Same units as m_cost.
    std::array<uint32_t, GraphWay::NUM_WAYS> m_critPathCost;

    uint32_t m_serialId;  // Unique MTask ID number

    // Count "generations", which are just operations that scan through the
    // graph. We'll mark each node with the last generation that scanned
    // it. We can use this to avoid recursing through the same node twice
    // while searching for a path.
    vluint64_t m_generation = 0;

    // Redundant with the V3GraphEdge's, store a map of relatives so we can
    // quickly check if we have a given parent or child.
    //
    // 'm_edges[way]' maps a wayward relative to the !way critical path at
    // our edge with them. The SortByValueMap supports iterating over
    // relatives in longest-to-shortest CP order. We rely on this ordering
    // in more than one place.
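    //
    // (Illustrative: for an mtask A with children B and C, m_edges[FORWARD]
    // at A maps each child to stepCost() plus its REVERSE critical path,
    // i.e. its inclusive downstream CP, so rbegin() yields the child on
    // A's longest downstream path; see addRelative() below.)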
    using EdgeSet = SortByValueMap<LogicMTask*, uint32_t, CmpLogicMTask>;
    std::array<EdgeSet, GraphWay::NUM_WAYS> m_edges;

public:
    // CONSTRUCTORS
    LogicMTask(V3Graph* graphp, MTaskMoveVertex* mtmvVxp)
        : AbstractLogicMTask{graphp} {
        for (unsigned int& i : m_critPathCost) i = 0;
        if (mtmvVxp) {  // Else null for test
            m_vertices.push_back(mtmvVxp);
            if (OrderLogicVertex* olvp = mtmvVxp->logicp()) {
                m_cost += V3InstrCount::count(olvp->nodep(), true);
            }
        }
        // Start at 1, so that 0 indicates no mtask ID.
        static uint32_t s_nextId = 1;
        m_serialId = s_nextId++;
        UASSERT(s_nextId < 0xFFFFFFFFUL, "Too many mtasks");
    }

    // METHODS
    void moveAllVerticesFrom(LogicMTask* otherp) {
        // splice() is constant time
        m_vertices.splice(m_vertices.end(), otherp->m_vertices);
        m_cost += otherp->m_cost;
    }
    virtual const VxList* vertexListp() const override { return &m_vertices; }
    static vluint64_t incGeneration() {
        static vluint64_t s_generation = 0;
        ++s_generation;
        return s_generation;
    }

    // Use this instead of pointer-compares to compare LogicMTasks. Avoids
    // nondeterministic output. Also name mtasks based on this number in
    // the final C++ output.
    virtual uint32_t id() const override { return m_serialId; }
    void id(uint32_t id) { m_serialId = id; }
    // Abstract cost of every logic mtask
    virtual uint32_t cost() const override { return m_cost; }
    void setCost(uint32_t cost) { m_cost = cost; }  // For tests only
    uint32_t stepCost() const { return stepCost(m_cost); }
    static uint32_t stepCost(uint32_t cost) {
#if PART_STEPPED_COST
        // Round cost up to the nearest 5%. Use this when computing all
        // critical paths. The idea is that critical path changes don't
        // need to propagate when they don't exceed the next step, saving a
        // lot of recursion.
        if (cost == 0) return 0;

        double logcost = log(cost);
        // log(1.05) is about 0.05
        // So, round logcost up to the next 0.05 boundary
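        // (Worked example: cost=1000 gives log(1000)=6.9078; *20 = 138.16,
        //  ceil = 139, /20 = 6.95; exp(6.95) ~= 1044. So any integer cost
        //  in [993, 1044] steps to 1044, within the <=10% bound asserted
        //  below.)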
        logcost *= 20.0;
        logcost = ceil(logcost);
        logcost = logcost / 20.0;

        uint32_t stepCost = static_cast<uint32_t>(exp(logcost));
        UASSERT_STATIC(stepCost >= cost, "stepped cost error exceeded");
        UASSERT_STATIC(stepCost <= ((cost * 11 / 10)), "stepped cost error exceeded");
        return stepCost;
#else
        return cost;
#endif
    }

    void addRelative(GraphWay way, LogicMTask* relativep) {
        EdgeSet& edges = m_edges[way];
        UASSERT(!edges.has(relativep), "Adding existing edge");
        // value is !way cp to this edge
        edges.set(relativep, relativep->stepCost() + relativep->critPathCost(way.invert()));
    }
    void removeRelative(GraphWay way, LogicMTask* relativep) {
        EdgeSet& edges = m_edges[way];
        edges.erase(relativep);
    }
    bool hasRelative(GraphWay way, LogicMTask* relativep) {
        const EdgeSet& edges = m_edges[way];
        return edges.has(relativep);
    }
    void checkRelativesCp(GraphWay way) const {
        const EdgeSet& edges = m_edges[way];
        for (EdgeSet::const_reverse_iterator it = edges.rbegin(); it != edges.rend(); ++it) {
            LogicMTask* relativep = (*it).key();
            uint32_t cachedCp = (*it).value();
            partCheckCachedScoreVsActual(cachedCp, relativep->critPathCost(way.invert())
                                                       + relativep->stepCost());
        }
    }

    virtual string name() const override {
        // Display forward and reverse critical path costs. This gives a quick
        // read on whether graph partitioning looks reasonable or bad.
        std::ostringstream out;
        out << "mt" << m_serialId << "." << this << " [b" << m_critPathCost[GraphWay::FORWARD]
            << " a" << m_critPathCost[GraphWay::REVERSE] << " c" << cost() << "]";
        return out.str();
    }

    void setCritPathCost(GraphWay way, uint32_t cost) { m_critPathCost[way] = cost; }
    uint32_t critPathCost(GraphWay way) const { return m_critPathCost[way]; }
    uint32_t critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const {
        // Compute the critical path cost wayward to this node, without
        // considering edge 'withoutp'
        UASSERT(this == withoutp->furtherp(way),
                "In critPathCostWithout(), edge 'withoutp' must lead further to 'this'");

        // Iterate through edges until we get a relative other than
        // wayEdgeEndp(way, withoutp). This should take 2 iterations max.
        const EdgeSet& edges = m_edges[way.invert()];
        uint32_t result = 0;
        for (EdgeSet::const_reverse_iterator it = edges.rbegin(); it != edges.rend(); ++it) {
            if ((*it).key() != withoutp->furtherp(way.invert())) {
                // Use the cached cost. It could be a small overestimate
                // due to stepping. This is consistent with critPathCost()
                // which also returns the cached cost.
                result = (*it).value();
                break;
            }
        }
        return result;
    }

private:
    static bool pathExistsFromInternal(LogicMTask* fromp, LogicMTask* top,
                                       const V3GraphEdge* excludedEdgep, vluint64_t generation) {
        // Q) Why does this take LogicMTask instead of generic V3GraphVertex?
        // A) We'll use the critical paths known to LogicMTask to prune the
        //    recursion for speed. Also store 'generation' in
        //    LogicMTask::m_generation so we can prune the search and avoid
        //    recursing through the same node more than once in a single
        //    search.

        if (fromp->m_generation == generation) {
            // Already looked at this node in the current search.
            // Since we're back again, we must not have found a path on the
            // first go.
            return false;
        }
        fromp->m_generation = generation;

        // Base case: we found a path.
        if (fromp == top) return true;

        // Base case: fromp is too late, cannot possibly be a prereq for top.
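        // (Reasoning: anything upstream of 'top' must have a REVERSE CP at
        // least as large as top's inclusive REVERSE CP; since CPs are built
        // from the same stepped costs, this pruning cannot miss a path.)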
        if (fromp->critPathCost(GraphWay::REVERSE)
            < (top->critPathCost(GraphWay::REVERSE) + top->stepCost())) {
            return false;
        }
        if ((fromp->critPathCost(GraphWay::FORWARD) + fromp->stepCost())
            > top->critPathCost(GraphWay::FORWARD)) {
            return false;
        }

        // Recursively look for a path
        for (const V3GraphEdge* followp = fromp->outBeginp(); followp;
             followp = followp->outNextp()) {
            if (followp == excludedEdgep) continue;
            LogicMTask* nextp = dynamic_cast<LogicMTask*>(followp->top());
            if (pathExistsFromInternal(nextp, top, nullptr, generation)) return true;
        }
        return false;
    }

    // True if there's a path from 'fromp' to 'top' excluding
    // 'excludedEdgep', false otherwise.
    //
    // 'excludedEdgep' may be nullptr in which case no edge is excluded. If
    // 'excludedEdgep' is non-nullptr it must connect fromp and top.
    //
    // TODO: consider changing this API to the 'isTransitiveEdge' API
    // used by GraphPathChecker
public:
    static bool pathExistsFrom(LogicMTask* fromp, LogicMTask* top,
                               const V3GraphEdge* excludedEdgep) {
        return pathExistsFromInternal(fromp, top, excludedEdgep, incGeneration());
    }

    static void dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment) {
        const string filename = v3Global.debugFilename(nameComment) + ".txt";
        UINFO(1, "Writing " << filename << endl);
        std::unique_ptr<std::ofstream> ofp(V3File::new_ofstream(filename));
        std::ostream* osp = &(*ofp);  // &* needed to deref unique_ptr
        if (osp->fail()) v3fatalStatic("Can't write " << filename);

        // Find start vertex with longest CP
        const LogicMTask* startp = nullptr;
        for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp;
             vxp = vxp->verticesNextp()) {
            const LogicMTask* mtaskp = dynamic_cast<const LogicMTask*>(vxp);
            if (!startp) {
                startp = mtaskp;
                continue;
            }
            if (mtaskp->cost() + mtaskp->critPathCost(GraphWay::REVERSE)
                > startp->cost() + startp->critPathCost(GraphWay::REVERSE)) {
                startp = mtaskp;
            }
        }

        // Follow the entire critical path
        std::vector<const LogicMTask*> path;
        uint32_t totalCost = 0;
        for (const LogicMTask* nextp = startp; nextp;) {
            path.push_back(nextp);
            totalCost += nextp->cost();

            const EdgeSet& children = nextp->m_edges[GraphWay::FORWARD];
            EdgeSet::const_reverse_iterator it = children.rbegin();
            if (it == children.rend()) {
                nextp = nullptr;
            } else {
                nextp = (*it).key();
            }
        }

        *osp << "totalCost = " << totalCost
             << " (should match the computed critical path cost (CP) for the graph)\n";

        // Dump
        for (const LogicMTask* mtaskp : path) {
            *osp << "begin mtask with cost " << mtaskp->cost() << '\n';
            for (VxList::const_iterator lit = mtaskp->vertexListp()->begin();
                 lit != mtaskp->vertexListp()->end(); ++lit) {
                const OrderLogicVertex* logicp = (*lit)->logicp();
                if (!logicp) continue;
                if (false) {
                    // Show nodes only
                    *osp << "> ";
                    logicp->nodep()->dumpTree(*osp);
                } else {
                    // Show nodes with hierarchical costs
                    V3InstrCount::count(logicp->nodep(), false, osp);
                }
            }
        }
    }

private:
    VL_DEBUG_FUNC;  // Declare debug()
    VL_UNCOPYABLE(LogicMTask);
};

//######################################################################
// MTask utility classes

// Sort AbstractMTask objects into deterministic order by calling id()
// which is a unique and stable serial number.
class MTaskIdLessThan final {
public:
    MTaskIdLessThan() = default;
    virtual ~MTaskIdLessThan() = default;
    virtual bool operator()(const AbstractMTask* lhsp, const AbstractMTask* rhsp) const {
        return lhsp->id() < rhsp->id();
    }
};

// Information associated with scoreboarding an MTask
class MergeCandidate VL_NOT_FINAL {
private:
    bool m_removedFromSb = false;  // Not on scoreboard, generally ignore
    vluint64_t m_id;  // Serial number for ordering
public:
    // CONSTRUCTORS
    MergeCandidate() {
        static vluint64_t serial = 0;
        ++serial;
        m_id = serial;
    }
    virtual bool mergeWouldCreateCycle() const = 0;
    // METHODS
    bool removedFromSb() const { return m_removedFromSb; }
    void removedFromSb(bool removed) { m_removedFromSb = removed; }
    bool operator<(const MergeCandidate& other) const { return m_id < other.m_id; }
};

// A pair of associated LogicMTask's that are merge candidates for sibling
// contraction
class SiblingMC final : public MergeCandidate {
private:
    LogicMTask* m_ap;
    LogicMTask* m_bp;

public:
    // CONSTRUCTORS
    SiblingMC() = delete;
    SiblingMC(LogicMTask* ap, LogicMTask* bp) {
        // Assign 'ap' and 'bp' in a canonical order, so we can more easily
        // compare pairs of SiblingMCs
        if (ap->id() > bp->id()) {
            m_ap = ap;
            m_bp = bp;
        } else {
            m_ap = bp;
            m_bp = ap;
        }
    }
    virtual ~SiblingMC() = default;
    // METHODS
    LogicMTask* ap() const { return m_ap; }
    LogicMTask* bp() const { return m_bp; }
    bool mergeWouldCreateCycle() const override {
        return (LogicMTask::pathExistsFrom(m_ap, m_bp, nullptr)
                || LogicMTask::pathExistsFrom(m_bp, m_ap, nullptr));
    }
    bool operator<(const SiblingMC& other) const {
        if (m_ap->id() < other.m_ap->id()) return true;
        if (m_ap->id() > other.m_ap->id()) return false;
        return m_bp->id() < other.m_bp->id();
    }
};

// GraphEdge for the MTask graph
class MTaskEdge final : public V3GraphEdge, public MergeCandidate {
public:
    // CONSTRUCTORS
    MTaskEdge(V3Graph* graphp, LogicMTask* fromp, LogicMTask* top, int weight)
        : V3GraphEdge{graphp, fromp, top, weight} {
        fromp->addRelative(GraphWay::FORWARD, top);
        top->addRelative(GraphWay::REVERSE, fromp);
    }
    virtual ~MTaskEdge() override {
        fromMTaskp()->removeRelative(GraphWay::FORWARD, toMTaskp());
        toMTaskp()->removeRelative(GraphWay::REVERSE, fromMTaskp());
    }
    // METHODS
    LogicMTask* furtherMTaskp(GraphWay way) const {
        return dynamic_cast<LogicMTask*>(this->furtherp(way));
    }
    LogicMTask* fromMTaskp() const { return dynamic_cast<LogicMTask*>(fromp()); }
    LogicMTask* toMTaskp() const { return dynamic_cast<LogicMTask*>(top()); }
    virtual bool mergeWouldCreateCycle() const override {
        return LogicMTask::pathExistsFrom(fromMTaskp(), toMTaskp(), this);
    }
    static MTaskEdge* cast(V3GraphEdge* edgep) {
        if (!edgep) return nullptr;
        MTaskEdge* resultp = dynamic_cast<MTaskEdge*>(edgep);
        UASSERT(resultp, "Failed to cast in MTaskEdge::cast");
        return resultp;
    }
    // Following initial assignment of critical paths, clear this MTaskEdge
    // out of the edge-map for each node and reinsert at a new location
    // with updated critical path.
    void resetCriticalPaths() {
        LogicMTask* fromp = fromMTaskp();
        LogicMTask* top = toMTaskp();
        fromp->removeRelative(GraphWay::FORWARD, top);
        top->removeRelative(GraphWay::REVERSE, fromp);
        fromp->addRelative(GraphWay::FORWARD, top);
        top->addRelative(GraphWay::REVERSE, fromp);
    }

private:
    VL_UNCOPYABLE(MTaskEdge);
};

//######################################################################
// Vertex utility classes

class OrderByPtrId final {
    PartPtrIdMap m_ids;

public:
    virtual bool operator()(const OrderVarStdVertex* lhsp, const OrderVarStdVertex* rhsp) const {
        vluint64_t l_id = m_ids.findId(lhsp);
        vluint64_t r_id = m_ids.findId(rhsp);
        return l_id < r_id;
    }
};

//######################################################################
// PartParallelismEst - Estimate parallelism of graph

class PartParallelismEst final {
    // MEMBERS
    const V3Graph* m_graphp;  // Mtask-containing graph

    // Total cost of evaluating the whole graph.
    // The ratio of m_totalGraphCost to longestCpCost gives us an estimate
    // of the parallelizability of this graph, which is only as good as the
    // guess returned by LogicMTask::cost().
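    // (Illustrative: total cost 1000 with a longest critical path of 250
    //  gives parallelismFactor() = 4.0, i.e. at best ~4 threads kept busy.)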
    uint32_t m_totalGraphCost = 0;

    // Cost of the longest critical path, in abstract units (the same units
    // returned by vertexCost())
    uint32_t m_longestCpCost = 0;

    size_t m_vertexCount = 0;  // Number of vertexes calculated
    size_t m_edgeCount = 0;  // Number of edges calculated

public:
    // CONSTRUCTORS
    explicit PartParallelismEst(const V3Graph* graphp)
        : m_graphp{graphp} {}

    // METHODS
    uint32_t totalGraphCost() const { return m_totalGraphCost; }
    uint32_t longestCritPathCost() const { return m_longestCpCost; }
    size_t vertexCount() const { return m_vertexCount; }
    size_t edgeCount() const { return m_edgeCount; }
    double parallelismFactor() const {
        return (static_cast<double>(m_totalGraphCost) / m_longestCpCost);
    }
    void traverse() {
        // For each node, record the critical path cost from the start
        // of the graph through the end of the node.
        std::unordered_map<const V3GraphVertex*, uint32_t> critPaths;
        GraphStreamUnordered serialize(m_graphp);
        for (const V3GraphVertex* vertexp; (vertexp = serialize.nextp());) {
            m_vertexCount++;
            uint32_t cpCostToHere = 0;
            for (V3GraphEdge* edgep = vertexp->inBeginp(); edgep; edgep = edgep->inNextp()) {
                ++m_edgeCount;
                // Each upstream item's critical path cost is a candidate
                // critical path cost to the current node. Whichever is
                // largest is the critical path to reach the start of this
                // node.
                cpCostToHere = std::max(cpCostToHere, critPaths[edgep->fromp()]);
            }
            // Include the cost of the current vertex in the critical
            // path, so it represents the critical path to the end of
            // this vertex.
            cpCostToHere += vertexCost(vertexp);
            critPaths[vertexp] = cpCostToHere;
            m_longestCpCost = std::max(m_longestCpCost, cpCostToHere);
            // Tally the total cost contributed by vertices.
            m_totalGraphCost += vertexCost(vertexp);
        }
    }
    void statsReport(const string& stage) const {
        V3Stats::addStat("MTask graph, " + stage + ", critical path cost", m_longestCpCost);
        V3Stats::addStat("MTask graph, " + stage + ", total graph cost", m_totalGraphCost);
        V3Stats::addStat("MTask graph, " + stage + ", mtask count", m_vertexCount);
        V3Stats::addStat("MTask graph, " + stage + ", edge count", m_edgeCount);
        V3Stats::addStat("MTask graph, " + stage + ", parallelism factor", parallelismFactor());
    }
    void debugReport() const {
        UINFO(0, "    Critical path cost = " << m_longestCpCost << endl);
        UINFO(0, "    Total graph cost = " << m_totalGraphCost << endl);
        UINFO(0, "    MTask vertex count = " << m_vertexCount << endl);
        UINFO(0, "    Edge count = " << m_edgeCount << endl);
        UINFO(0, "    Parallelism factor = " << parallelismFactor() << endl);
    }
    static uint32_t vertexCost(const V3GraphVertex* vertexp) {
        return dynamic_cast<const AbstractMTask*>(vertexp)->cost();
    }

private:
    VL_DEBUG_FUNC;  // Declare debug()
    VL_UNCOPYABLE(PartParallelismEst);
};

//######################################################################

// Look at vertex costs (in one way) to form critical paths for each
// vertex.
static void partInitHalfCriticalPaths(GraphWay way, V3Graph* mtasksp, bool checkOnly) {
    GraphStreamUnordered order(mtasksp, way);
    const GraphWay rev = way.invert();
    for (const V3GraphVertex* vertexp; (vertexp = order.nextp());) {
        const LogicMTask* mtaskcp = dynamic_cast<const LogicMTask*>(vertexp);
        LogicMTask* mtaskp = const_cast<LogicMTask*>(mtaskcp);
        uint32_t cpCost = 0;
        std::unordered_set<V3GraphVertex*> relatives;
        for (V3GraphEdge* edgep = vertexp->beginp(rev); edgep; edgep = edgep->nextp(rev)) {
            // Run a few asserts on the initial mtask graph,
            // while we're iterating through...
            UASSERT_OBJ(edgep->weight() != 0, mtaskp, "Should be no cut edges in mtasks graph");
            UASSERT_OBJ(relatives.find(edgep->furtherp(rev)) == relatives.end(), mtaskp,
                        "Should be no redundant edges in mtasks graph");
            relatives.insert(edgep->furtherp(rev));

            LogicMTask* relativep = dynamic_cast<LogicMTask*>(edgep->furtherp(rev));
            cpCost = std::max(cpCost, (relativep->critPathCost(way)
                                       + static_cast<uint32_t>(relativep->stepCost())));
        }
        if (checkOnly) {
            partCheckCachedScoreVsActual(mtaskp->critPathCost(way), cpCost);
        } else {
            mtaskp->setCritPathCost(way, cpCost);
        }
    }
}

// Look at vertex costs to form critical paths for each vertex.
static void partInitCriticalPaths(V3Graph* mtasksp) {
    partInitHalfCriticalPaths(GraphWay::FORWARD, mtasksp, false);
    partInitHalfCriticalPaths(GraphWay::REVERSE, mtasksp, false);

    // Reset all MTaskEdges so that 'm_edges' will show correct CP numbers.
    // They would have been all zeroes on initial creation of the MTaskEdges.
    for (V3GraphVertex* vxp = mtasksp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
        for (V3GraphEdge* edgep = vxp->outBeginp(); edgep; edgep = edgep->outNextp()) {
            MTaskEdge* mtedgep = dynamic_cast<MTaskEdge*>(edgep);
            mtedgep->resetCriticalPaths();
        }
    }
}

// Do an EXPENSIVE check to make sure that all incremental CP updates have
// gone correctly.
static void partCheckCriticalPaths(V3Graph* mtasksp) {
    partInitHalfCriticalPaths(GraphWay::FORWARD, mtasksp, true);
    partInitHalfCriticalPaths(GraphWay::REVERSE, mtasksp, true);
    for (V3GraphVertex* vxp = mtasksp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
        LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(vxp);
        mtaskp->checkRelativesCp(GraphWay::FORWARD);
        mtaskp->checkRelativesCp(GraphWay::REVERSE);
    }
}

// Advance to nextp(way) and delete edge
static V3GraphEdge* partBlastEdgep(GraphWay way, V3GraphEdge* edgep) {
    V3GraphEdge* nextp = edgep->nextp(way);
    VL_DO_DANGLING(edgep->unlinkDelete(), edgep);
    return nextp;
}

// Merge edges from a LogicMTask.
//
// When a donor edge duplicates an edge the recipient already has
// ('hasRelative'), the donor edge is removed and the existing edge is
// marked in need of a rescore, in case its score has fallen and we need
// to move it up toward the front of the scoreboard.
//
// Wait, whaaat? Shouldn't the scores only increase as we merge nodes? Well
// that's almost true. But there is one exception.
//
// Suppose we have A->B, B->C, and A->C.
//
// The A->C edge is a "transitive" edge. It's ineligible to be merged, as
// the merge would create a cycle. We score it on the scoreboard like any
// other edge.
//
// However, our "score" estimate for A->C is bogus, because the forward
// critical path to C and the reverse critical path to A both contain the
// same node (B) so we overestimate the score of A->C. At first this
// doesn't matter, since transitive edges aren't eligible to merge anyway.
//
// Later, suppose the edge contractor decides to merge the B->C edge, with
// B donating all its incoming edges into C, say. (So we reach this
// function.)
//
// With B going away, the A->C edge will no longer be transitive and it
// will become eligible to merge. But if we don't mark it for rescore,
// it'll stay in the scoreboard with its old (overestimated) score. We'll
// merge it too late due to the bogus score. When we finally merge it, we
// fail the assert in the main edge contraction loop which checks that the
// actual score did not fall below the scoreboard's score.
//
// Another way of stating this: this code ensures that scores of
// non-transitive edges only ever increase.
static void partMergeEdgesFrom(V3Graph* mtasksp, LogicMTask* recipientp, LogicMTask* donorp,
                               V3Scoreboard<MergeCandidate, uint32_t>* sbp) {
    for (unsigned wi = 0; wi < 2; ++wi) {
        const GraphWay way = wi ? GraphWay::REVERSE : GraphWay::FORWARD;
        for (V3GraphEdge* edgep = donorp->beginp(way); edgep; edgep = partBlastEdgep(way, edgep)) {
            MTaskEdge* tedgep = MTaskEdge::cast(edgep);
            if (sbp && !tedgep->removedFromSb()) sbp->removeElem(tedgep);
            // Existing edge; mark it in need of a rescore
            if (recipientp->hasRelative(way, tedgep->furtherMTaskp(way))) {
                if (sbp) {
                    MTaskEdge* existMTaskEdgep = MTaskEdge::cast(
                        recipientp->findConnectingEdgep(way, tedgep->furtherMTaskp(way)));
                    UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge");
                    if (!existMTaskEdgep->removedFromSb()) {
                        sbp->hintScoreChanged(existMTaskEdgep);
                    }
                }
            } else {
                // No existing edge into *this, make one.
                MTaskEdge* newEdgep;
                if (way == GraphWay::REVERSE) {
                    newEdgep = new MTaskEdge(mtasksp, tedgep->fromMTaskp(), recipientp, 1);
                } else {
                    newEdgep = new MTaskEdge(mtasksp, recipientp, tedgep->toMTaskp(), 1);
                }
                if (sbp) sbp->addElem(newEdgep);
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
|
//######################################################################
|
|
|
|
|
// PartContraction
|
|
|
|
|
|
|
|
|
|
// Perform edge or sibling contraction on the partition graph
|
2020-11-19 03:32:16 +01:00
|
|
|
class PartContraction final {
|
2018-07-23 02:54:28 +02:00
|
|
|
private:
|
|
|
|
|
// TYPES
|
|
|
|
|
|
|
|
|
|
// TODO: might get a little more speed by making this a
|
2020-08-15 16:03:34 +02:00
|
|
|
// std::unordered_set and defining hash and equal_to functors for the
|
2018-07-23 02:54:28 +02:00
|
|
|
// SiblingMC:
|
2021-03-13 00:10:45 +01:00
|
|
|
using SibSet = std::set<SiblingMC>;
|
|
|
|
|
using SibpSet = std::unordered_set<const SiblingMC*>;
|
|
|
|
|
using MTask2Sibs = std::unordered_map<const LogicMTask*, SibpSet>;
|
2018-07-23 02:54:28 +02:00
|
|
|
|
|
|
|
|
// New CP information for mtaskp reflecting an upcoming merge
|
|
|
|
|
struct NewCp {
|
|
|
|
|
uint32_t cp;
|
|
|
|
|
uint32_t propagateCp;
|
|
|
|
|
bool propagate;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// MEMBERS
|
|
|
|
|
V3Graph* m_mtasksp; // Mtask graph
|
|
|
|
|
uint32_t m_scoreLimit; // Sloppy score allowed when picking merges
|
2020-08-15 19:11:27 +02:00
|
|
|
uint32_t m_scoreLimitBeforeRescore = 0xffffffff; // Next score rescore at
|
|
|
|
|
unsigned m_mergesSinceRescore = 0; // Merges since last rescore
|
2018-07-23 02:54:28 +02:00
|
|
|
bool m_slowAsserts; // Take extra time to validate algorithm
|
|
|
|
|
V3Scoreboard<MergeCandidate, uint32_t> m_sb; // Scoreboard
|
|
|
|
|
SibSet m_pairs; // Storage for each SiblingMC
|
|
|
|
|
MTask2Sibs m_mtask2sibs; // SiblingMC set for each mtask
|
|
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
// CONSTRUCTORS
|
|
|
|
|
PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, bool slowAsserts)
|
2020-08-16 15:55:36 +02:00
|
|
|
: m_mtasksp{mtasksp}
|
|
|
|
|
, m_scoreLimit{scoreLimit}
|
|
|
|
|
, m_slowAsserts{slowAsserts}
|
|
|
|
|
, m_sb{&mergeCandidateScore, slowAsserts} {}

    // METHODS
    void go() {
        unsigned maxMTasks = v3Global.opt.threadsMaxMTasks();
        if (maxMTasks == 0) {  // Unspecified so estimate
            if (v3Global.opt.threads() > 1) {
                maxMTasks = (PART_DEFAULT_MAX_MTASKS_PER_THREAD * v3Global.opt.threads());
            } else {
                // Running PartContraction with --threads <= 1 means self-test
                maxMTasks = 500;
            }
        }

        // OPTIMIZATION PASS: Edge contraction and sibling contraction.
        //  - Score each pair of mtasks which is a candidate to merge.
        //    * Each edge defines such a candidate pair
        //    * Two mtasks that are prereqs or postreqs of a common third
        //      vertex are "siblings", these are also a candidate pair.
        //  - Build a list of MergeCandidates, sorted by score.
        //  - Merge the best pair.
        //  - Incrementally recompute critical paths near the merged mtask.

        for (V3GraphVertex* itp = m_mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) {
            std::unordered_set<const V3GraphVertex*> neighbors;
            for (V3GraphEdge* edgep = itp->outBeginp(); edgep; edgep = edgep->outNextp()) {
                m_sb.addElem(MTaskEdge::cast(edgep));
                UASSERT_OBJ(neighbors.find(edgep->top()) == neighbors.end(), itp,
                            "Redundant edge found in input to PartContraction()");
                neighbors.insert(edgep->top());
            }
            siblingPairFromRelatives(GraphWay::REVERSE, itp, true);
            siblingPairFromRelatives(GraphWay::FORWARD, itp, true);
        }

        doRescore();  // Set initial scores in scoreboard

        while (true) {
            // This is the best edge to merge, with the lowest
            // score (shortest local critical path)
            MergeCandidate* mergeCanp = const_cast<MergeCandidate*>(m_sb.bestp());
            if (!mergeCanp) {
                // Scoreboard found no eligible merges. Maybe a rescore
                // will produce some merge-able pairs?
                if (m_sb.needsRescore()) {
                    doRescore();
                    continue;
                }
                break;
            }

            if (m_slowAsserts) {
                UASSERT(!m_sb.needsRescore(mergeCanp),
                        "Need-rescore items should not be returned by bestp");
            }
            uint32_t cachedScore = m_sb.cachedScore(mergeCanp);
            uint32_t actualScore = mergeCandidateScore(mergeCanp);

            if (actualScore > cachedScore) {
                // Cached score is out-of-date.
                // Mark this elem as in need of a rescore and continue.
                m_sb.hintScoreChanged(mergeCanp);
                continue;
            }
            // ... we'll also confirm that actualScore hasn't shrunk relative
            // to cached score, after the mergeWouldCreateCycle() check.

            if (actualScore > m_scoreLimit) {
                // Our best option isn't good enough
                if (m_sb.needsRescore()) {
                    // Some pairs need a rescore, maybe those will be
                    // eligible to merge afterward.
                    doRescore();
                    continue;
                } else {
                    // We've exhausted everything below m_scoreLimit; stop.

                    // Except, if we have too many mtasks, raise the score
                    // limit and keep going...
                    unsigned mtaskCount = 0;
                    for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp(); vxp;
                         vxp = vxp->verticesNextp()) {
                        ++mtaskCount;
                    }
                    if (mtaskCount > maxMTasks) {
                        uint32_t oldLimit = m_scoreLimit;
                        m_scoreLimit = (m_scoreLimit * 120) / 100;
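                        // For illustration (hypothetical numbers): a limit of
                        // 1000 becomes 1200 here, then 1440 on the next bump,
                        // and so on, growing ~20% each time until enough
                        // merges fit under it.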
                        v3Global.rootp()->fileline()->v3warn(
                            UNOPTTHREADS, "Thread scheduler is unable to provide requested "
                                          "parallelism; suggest asking for fewer threads.");
                        UINFO(1, "Critical path limit was=" << oldLimit << " now=" << m_scoreLimit
                                                            << endl);
                        continue;
                    }
                    // Really stop
                    break;
                }
            }
            if (actualScore > m_scoreLimitBeforeRescore) {
                // Time to rescore, that will result in a higher
                // scoreLimitBeforeRescore, and possibly lower-scoring
                // elements returned from bestp().
                doRescore();
                continue;
            }

            // Avoid merging any edge that would create a cycle.
            //
            // For example suppose we begin with vertices A, B, C and edges
            // A->B, B->C, A->C.
            //
            // Suppose we want to merge A->C into a single vertex.
            // New edges would be AC->B and B->AC which is not a DAG.
            // Do not allow this.
            if (mergeCanp->mergeWouldCreateCycle()) {
                // Remove this edge from scoreboard so we don't keep
                // reconsidering it on every loop.
                m_sb.removeElem(mergeCanp);
                mergeCanp->removedFromSb(true);
                continue;
            }

            partCheckCachedScoreVsActual(cachedScore, actualScore);

            // Finally there's no cycle risk, no need to rescore, we're
            // within m_scoreLimit and m_scoreLimitBeforeRescore.
            // This is the edge to merge.
            //
            // Bookkeeping: if this is the first edge we'll merge since
            // the last rescore, compute the new m_scoreLimitBeforeRescore
            // to be somewhat higher than this edge's score.
            if (m_mergesSinceRescore == 0) {
#if PART_STEPPED_RESCORELIMIT
                m_scoreLimitBeforeRescore = (actualScore * 105) / 100;
#else
                m_scoreLimitBeforeRescore = actualScore;
#endif

                // This print can serve as a progress indicator, as it
                // increases from low numbers up toward cpLimit. It may be
                // helpful to see progress during slow partitions. Maybe
                // display something by default even?
                UINFO(6, "New scoreLimitBeforeRescore: " << m_scoreLimitBeforeRescore << endl);
            }

            // Finally merge this candidate.
            contract(mergeCanp);
        }
    }

private:
    NewCp newCp(GraphWay way, LogicMTask* mtaskp, LogicMTask* otherp, MTaskEdge* mergeEdgep) {
        // Return new wayward-CP for mtaskp reflecting its upcoming merge
        // with otherp. Set 'result.propagate' if mtaskp's wayward
        // relatives will see a new wayward CP from this merge.
        uint32_t newCp;
        if (mergeEdgep) {
            if (mtaskp == mergeEdgep->furtherp(way)) {
                newCp = std::max(otherp->critPathCost(way),
                                 mtaskp->critPathCostWithout(way, mergeEdgep));
            } else {
                newCp = std::max(mtaskp->critPathCost(way),
                                 otherp->critPathCostWithout(way, mergeEdgep));
            }
        } else {
            newCp = std::max(otherp->critPathCost(way), mtaskp->critPathCost(way));
        }

        uint32_t origRelativesCp = mtaskp->critPathCost(way) + mtaskp->stepCost();
        uint32_t newRelativesCp = newCp + LogicMTask::stepCost(mtaskp->cost() + otherp->cost());

        NewCp result;
        result.cp = newCp;
        result.propagate = (newRelativesCp > origRelativesCp);
        result.propagateCp = newRelativesCp;
        return result;
    }
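    // For illustration (hypothetical numbers): suppose otherp's wayward CP is
    // 100 and mtaskp's CP without the merge edge is 80, so result.cp = 100.
    // If the merged step cost comes to 40, relatives would see 100 + 40 = 140;
    // if they previously saw 120, then propagate = true and propagateCp = 140.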

    void removeSiblingMCsWith(LogicMTask* mtaskp) {
        for (SibpSet::iterator it = m_mtask2sibs[mtaskp].begin(); it != m_mtask2sibs[mtaskp].end();
             ++it) {
            const SiblingMC* pairp = *it;
            if (!pairp->removedFromSb()) m_sb.removeElem(pairp);
            LogicMTask* otherp = (pairp->bp() == mtaskp) ? pairp->ap() : pairp->bp();
            size_t erased = m_mtask2sibs[otherp].erase(pairp);
            UASSERT_OBJ(erased > 0, otherp, "Expected existing mtask");
            erased = m_pairs.erase(*pairp);
            UASSERT_OBJ(erased > 0, mtaskp, "Expected existing mtask");
        }
        const size_t erased = m_mtask2sibs.erase(mtaskp);
        UASSERT_OBJ(erased > 0, mtaskp, "Expected existing mtask");
    }

    void contract(MergeCandidate* mergeCanp) {
        LogicMTask* top = nullptr;
        LogicMTask* fromp = nullptr;
        MTaskEdge* mergeEdgep = dynamic_cast<MTaskEdge*>(mergeCanp);
        SiblingMC* mergeSibsp = nullptr;
        if (mergeEdgep) {
            top = dynamic_cast<LogicMTask*>(mergeEdgep->top());
            fromp = dynamic_cast<LogicMTask*>(mergeEdgep->fromp());
        } else {
            mergeSibsp = dynamic_cast<SiblingMC*>(mergeCanp);
            UASSERT(mergeSibsp, "Failed to cast mergeCanp to either MTaskEdge or SiblingMC");
            top = mergeSibsp->ap();
            fromp = mergeSibsp->bp();
        }

        // Merge the smaller mtask into the larger mtask. If one of them
        // is much larger, this will save time in partMergeEdgesFrom().
        // Assume the more costly mtask has more edges.
        //
        // [TODO: now that we have edge maps, we could count the edges
        // exactly without a linear search.]
        LogicMTask* recipientp;
        LogicMTask* donorp;
        if (fromp->cost() > top->cost()) {
            recipientp = fromp;
            donorp = top;
        } else {
            donorp = fromp;
            recipientp = top;
        }
        VL_DANGLING(fromp);
        VL_DANGLING(top);  // Use donorp and recipientp now instead

        // Recursively update forward and reverse CP numbers.
        //
        // Doing this before merging the mtasks lets us often avoid
        // recursing through either incoming or outgoing edges on one or
        // both mtasks.
        //
        // These 'NewCp' objects carry a bit indicating whether we must
        // propagate CP for each of the four cases:
        const NewCp recipientNewCpFwd = newCp(GraphWay::FORWARD, recipientp, donorp, mergeEdgep);
        const NewCp donorNewCpFwd = newCp(GraphWay::FORWARD, donorp, recipientp, mergeEdgep);
        const NewCp recipientNewCpRev = newCp(GraphWay::REVERSE, recipientp, donorp, mergeEdgep);
        const NewCp donorNewCpRev = newCp(GraphWay::REVERSE, donorp, recipientp, mergeEdgep);

        if (mergeEdgep) {
            // Remove and free the connecting edge. Must do this before
            // propagating CP's below.
            m_sb.removeElem(mergeCanp);
            VL_DO_CLEAR(mergeEdgep->unlinkDelete(), mergeEdgep = nullptr);
        }

        // This also updates cost and stepCost on recipientp
        recipientp->moveAllVerticesFrom(donorp);

        UINFO(9, "recipient = " << recipientp->id() << ", donor = " << donorp->id()
                                << ", mergeEdgep = " << mergeEdgep << "\n"
                                << "recipientNewCpFwd = " << recipientNewCpFwd.cp
                                << (recipientNewCpFwd.propagate ? " true " : " false ")
                                << recipientNewCpFwd.propagateCp << "\n"
                                << "donorNewCpFwd = " << donorNewCpFwd.cp
                                << (donorNewCpFwd.propagate ? " true " : " false ")
                                << donorNewCpFwd.propagateCp << endl);

        LogicMTask::CpCostAccessor cpAccess;
        PartPropagateCp<LogicMTask::CpCostAccessor> forwardPropagator(m_mtasksp, GraphWay::FORWARD,
                                                                      &cpAccess, m_slowAsserts);
        PartPropagateCp<LogicMTask::CpCostAccessor> reversePropagator(m_mtasksp, GraphWay::REVERSE,
                                                                      &cpAccess, m_slowAsserts);

        recipientp->setCritPathCost(GraphWay::FORWARD, recipientNewCpFwd.cp);
        if (recipientNewCpFwd.propagate) {
            forwardPropagator.cpHasIncreased(recipientp, recipientNewCpFwd.propagateCp);
        }
        recipientp->setCritPathCost(GraphWay::REVERSE, recipientNewCpRev.cp);
        if (recipientNewCpRev.propagate) {
            reversePropagator.cpHasIncreased(recipientp, recipientNewCpRev.propagateCp);
        }
        if (donorNewCpFwd.propagate) {
            forwardPropagator.cpHasIncreased(donorp, donorNewCpFwd.propagateCp);
        }
        if (donorNewCpRev.propagate) {
            reversePropagator.cpHasIncreased(donorp, donorNewCpRev.propagateCp);
        }
        forwardPropagator.go();
        reversePropagator.go();

        // Remove all SiblingMCs that include donorp. This includes the one
        // we're merging, if we're merging a SiblingMC.
        removeSiblingMCsWith(donorp);
        // Remove all SiblingMCs that include recipientp also, so we can't
        // get huge numbers of SiblingMCs. We'll recreate them below, up
        // to a bounded number.
        removeSiblingMCsWith(recipientp);

        // Merge all edges
        partMergeEdgesFrom(m_mtasksp, recipientp, donorp, &m_sb);

        // Delete the donorp mtask from the graph
        VL_DO_CLEAR(donorp->unlinkDelete(m_mtasksp), donorp = nullptr);

        m_mergesSinceRescore++;

        // Do an expensive check, confirm we haven't botched the CP
        // updates.
        if (m_slowAsserts) partCheckCriticalPaths(m_mtasksp);

        // Finally, make new sibling pairs as needed:
        //  - prereqs and postreqs of recipientp
        //  - prereqs of recipientp's postreqs
        //  - postreqs of recipientp's prereqs
        // Note that this depends on the updated critical paths (above).
        siblingPairFromRelatives(GraphWay::REVERSE, recipientp, true);
        siblingPairFromRelatives(GraphWay::FORWARD, recipientp, true);
        unsigned edges = 0;
        for (V3GraphEdge* edgep = recipientp->outBeginp(); edgep; edgep = edgep->outNextp()) {
            LogicMTask* postreqp = dynamic_cast<LogicMTask*>(edgep->top());
            siblingPairFromRelatives(GraphWay::REVERSE, postreqp, false);
            edges++;
            if (edges > PART_SIBLING_EDGE_LIMIT) break;
        }
        edges = 0;
        for (V3GraphEdge* edgep = recipientp->inBeginp(); edgep; edgep = edgep->inNextp()) {
            LogicMTask* prereqp = dynamic_cast<LogicMTask*>(edgep->fromp());
            siblingPairFromRelatives(GraphWay::FORWARD, prereqp, false);
            edges++;
            if (edges > PART_SIBLING_EDGE_LIMIT) break;
        }
    }

    void doRescore() {
        // During rescore, we know that graph isn't changing, so allow
        // the critPathCost*Without() routines to cache some data in
        // each LogicMTask. This is just an optimization, things should
        // behave identically without the caching (just slower)

        m_sb.rescore();
        UINFO(6, "Did rescore. Merges since previous = " << m_mergesSinceRescore << endl);
        m_mergesSinceRescore = 0;
        m_scoreLimitBeforeRescore = 0xffffffff;
    }

    static uint32_t mergeCandidateScore(const MergeCandidate* pairp) {
        const MTaskEdge* edgep = dynamic_cast<const MTaskEdge*>(pairp);
        if (edgep) {
            // The '1 +' favors merging a SiblingMC over an otherwise-
            // equal-scoring MTaskEdge. The comment on selfTest() talks
            // about why.
            return 1 + edgeScore(edgep);
        }
        const SiblingMC* sibsp = dynamic_cast<const SiblingMC*>(pairp);
        if (sibsp) return siblingScore(sibsp);
        v3fatalSrc("Failed to cast pairp to either MTaskEdge or SiblingMC in mergeCandidateScore");
        return 0;
    }

    static uint32_t siblingScore(const SiblingMC* sibsp) {
        LogicMTask* ap = sibsp->ap();
        LogicMTask* bp = sibsp->bp();
        uint32_t mergedCpCostFwd
            = std::max(ap->critPathCost(GraphWay::FORWARD), bp->critPathCost(GraphWay::FORWARD));
        uint32_t mergedCpCostRev
            = std::max(ap->critPathCost(GraphWay::REVERSE), bp->critPathCost(GraphWay::REVERSE));
        return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(ap->cost() + bp->cost());
    }

    static uint32_t edgeScore(const V3GraphEdge* edgep) {
        // Score this edge. Lower is better. The score is the new local CP
        // length if we merge these mtasks. ("Local" means the longest
        // critical path running through the merged node.)
        LogicMTask* top = dynamic_cast<LogicMTask*>(edgep->top());
        LogicMTask* fromp = dynamic_cast<LogicMTask*>(edgep->fromp());
        uint32_t mergedCpCostFwd = std::max(fromp->critPathCost(GraphWay::FORWARD),
                                            top->critPathCostWithout(GraphWay::FORWARD, edgep));
        uint32_t mergedCpCostRev = std::max(fromp->critPathCostWithout(GraphWay::REVERSE, edgep),
                                            top->critPathCost(GraphWay::REVERSE));
        return mergedCpCostRev + mergedCpCostFwd
               + LogicMTask::stepCost(fromp->cost() + top->cost());
    }
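    // For illustration (hypothetical numbers): if fromp's forward CP is 40,
    // top's forward CP without this edge is 35, fromp's reverse CP without
    // this edge is 25, top's reverse CP is 60, and the merged step cost is
    // 10, the score is max(25, 60) + max(40, 35) + 10 = 110 -- the local
    // critical path through the would-be merged node.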

    void makeSiblingMC(LogicMTask* ap, LogicMTask* bp) {
        SiblingMC newSibs(ap, bp);
        std::pair<SibSet::iterator, bool> insertResult = m_pairs.insert(newSibs);
        if (insertResult.second) {
            const SiblingMC* newSibsp = &(*insertResult.first);
            m_mtask2sibs[ap].insert(newSibsp);
            m_mtask2sibs[bp].insert(newSibsp);
            m_sb.addElem(newSibsp);
        } else if (m_slowAsserts) {
            // It's fine if we already have this SiblingMC, we may have
            // created it earlier. Just confirm that we have associated data.
            UASSERT_OBJ(m_mtask2sibs.find(ap) != m_mtask2sibs.end(), ap, "Sibling not found");
            UASSERT_OBJ(m_mtask2sibs.find(bp) != m_mtask2sibs.end(), bp, "Sibling not found");
            bool found = false;
            for (SibpSet::iterator it = m_mtask2sibs[ap].begin(); it != m_mtask2sibs[ap].end();
                 ++it) {
                const SiblingMC* sibsp = *it;
                UASSERT_OBJ(!(!sibsp->removedFromSb() && !m_sb.contains(sibsp)), ap,
                            "One sibling must be the one we collided with");
                if ((sibsp->ap() == ap && sibsp->bp() == bp)
                    || (sibsp->bp() == ap && sibsp->ap() == bp))
                    found = true;
            }
            UASSERT_OBJ(found, ap, "Sibling not found");
        }
    }

    static const GraphWay* s_shortestWaywardCpInclusiveWay;
    static int shortestWaywardCpInclusive(const void* vap, const void* vbp) {
        const GraphWay* wp = s_shortestWaywardCpInclusiveWay;
        const LogicMTask* ap = *reinterpret_cast<const LogicMTask* const*>(vap);
        const LogicMTask* bp = *reinterpret_cast<const LogicMTask* const*>(vbp);
        uint32_t aCp = ap->critPathCost(*wp) + ap->stepCost();
        uint32_t bCp = bp->critPathCost(*wp) + bp->stepCost();
        if (aCp < bCp) return -1;
        if (aCp > bCp) return 1;
        if (ap->id() < bp->id()) return -1;
        if (ap->id() > bp->id()) return 1;
        return 0;
    }

    void siblingPairFromRelatives(GraphWay way, V3GraphVertex* mtaskp, bool exhaustive) {
        std::vector<LogicMTask*> shortestPrereqs;

        for (V3GraphEdge* edgep = mtaskp->beginp(way); edgep; edgep = edgep->nextp(way)) {
            LogicMTask* prereqp = dynamic_cast<LogicMTask*>(edgep->furtherp(way));
            shortestPrereqs.push_back(prereqp);
            // Prevent nodes with huge numbers of edges from massively
            // slowing down the partitioner:
            if (shortestPrereqs.size() > PART_SIBLING_EDGE_LIMIT) break;
        }

        if (shortestPrereqs.empty()) return;

        // qsort_r would be nice here, but it isn't portable
        s_shortestWaywardCpInclusiveWay = &way;
        qsort(&shortestPrereqs[0], shortestPrereqs.size(), sizeof(LogicMTask*),
              &shortestWaywardCpInclusive);

        // Don't make all NxN/2 possible pairs of prereqs, that's a lot
        // to cart around. Just make a few pairs.
        auto it = shortestPrereqs.cbegin();
        for (unsigned i = 0; exhaustive || (i < 3); ++i) {
            if (it == shortestPrereqs.cend()) break;
            LogicMTask* ap = *(it++);
            if (it == shortestPrereqs.cend()) break;
            LogicMTask* bp = *(it++);
            makeSiblingMC(ap, bp);
        }
    }

    // SELF TESTS

    // This is a performance test, its intent is to demonstrate that the
    // partitioner doesn't run on this chain in N^2 time or worse. Overall
    // runtime should be N*log(N) for a chain-shaped graph.
    //
    static void selfTestChain() {
        vluint64_t usecsSmall = partitionChainUsecs(5);
        vluint64_t usecsLarge = partitionChainUsecs(500);
        // Large input is 100x bigger than small input.
        // Its runtime should grow roughly with N*log(N) -- not ~10,000x
        // longer or worse, which would suggest N^2 scaling or worse.
        UASSERT(usecsLarge < (usecsSmall * 1500),
                "selfTestChain() took longer than expected. Small input runtime = "
                    << usecsSmall << ", large input runtime = " << usecsLarge);
    }
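    // (The 1500x bound above is deliberately loose: with N*log(N) scaling
    // the expected ratio for 5 vs. 500 mtasks is on the order of a few
    // hundred, and timing noise plus fixed startup overhead would make a
    // tight bound flaky.)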

    static vluint64_t partitionChainUsecs(unsigned chain_len) {
        // NOTE: To get a dot file run with --debugi-V3Partition 4 or more.
        vluint64_t startUsecs = V3Os::timeUsecs();
        V3Graph mtasks;
        LogicMTask* lastp = nullptr;
        for (unsigned i = 0; i < chain_len; ++i) {
            LogicMTask* mtp = new LogicMTask(&mtasks, nullptr);
            mtp->setCost(1);
            if (lastp) new MTaskEdge(&mtasks, lastp, mtp, 1);
            lastp = mtp;
        }
        partInitCriticalPaths(&mtasks);

        // Since slowAsserts mode is *expected* to cause N^2 runtime, and the
        // intent of this test is to demonstrate better-than-N^2 runtime, disable
        // slowAsserts.
        PartContraction ec(&mtasks,
                           // Any CP limit >chain_len should work:
                           chain_len * 2, false /* slowAsserts */);
        ec.go();

        PartParallelismEst check(&mtasks);
        check.traverse();

        vluint64_t endUsecs = V3Os::timeUsecs();
        vluint64_t elapsedUsecs = endUsecs - startUsecs;

        if (debug() >= 6) {
            UINFO(0, "Chain self test stats:\n");
            check.debugReport();
            UINFO(0, "Elapsed usecs = " << elapsedUsecs << "\n");
        }

        // All vertices should merge into one
        UASSERT_SELFTEST(size_t, check.vertexCount(), 1);
        return elapsedUsecs;
    }

    // This test defends against a particular failure mode that the
    // partitioner exhibited during development:
    //
    // At one time, the partitioner consistently favored edge-merges over
    // equal-scoring sibling merges. Every edge and sibling merge in this
    // test starts out with an equal score. If you only do edge-merges, all
    // possible merges will continue to have equal score as the center node
    // grows and grows. Soon the critical path budget is exhausted by a
    // large center node, and we still have many small leaf nodes -- it's
    // literally the worst partition possible.
    //
    // Now, instead, the partitioner gives slight favoritism to sibling
    // merges in the event that scores are tied. This is better for the
    // test and also real designs.
    static void selfTestX() {
        // NOTE: To get a dot file run with --debugi-V3Partition 4 or more.
        V3Graph mtasks;
        LogicMTask* center = new LogicMTask(&mtasks, nullptr);
        center->setCost(1);
        unsigned i;
        for (i = 0; i < 50; ++i) {
            LogicMTask* mtp = new LogicMTask(&mtasks, nullptr);
            mtp->setCost(1);
            // Edge from every input -> center
            new MTaskEdge(&mtasks, mtp, center, 1);
        }
        for (i = 0; i < 50; ++i) {
            LogicMTask* mtp = new LogicMTask(&mtasks, nullptr);
            mtp->setCost(1);
            // Edge from center -> every output
            new MTaskEdge(&mtasks, center, mtp, 1);
        }

        partInitCriticalPaths(&mtasks);
        PartContraction(&mtasks, 20, true).go();

        PartParallelismEst check(&mtasks);
        check.traverse();

        // Checking exact values here is maybe overly precise. What we're
        // mostly looking for is a healthy reduction in the number of
        // mtasks.
        if (debug() >= 5) {
            UINFO(0, "X self test stats:\n");
            check.debugReport();
        }
        UASSERT_SELFTEST(uint32_t, check.longestCritPathCost(), 19);
        UASSERT_SELFTEST(uint32_t, check.totalGraphCost(), 101);
        UASSERT_SELFTEST(uint32_t, check.vertexCount(), 14);
        UASSERT_SELFTEST(uint32_t, check.edgeCount(), 13);
    }
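    // (Of the expected values above, only totalGraphCost is easy to verify
    // by hand: 1 center + 50 inputs + 50 outputs, each of cost 1, sums to
    // 101. The other three depend on the contraction heuristics and the CP
    // limit of 20, and were presumably recorded from a known-good run.)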

public:
    static void selfTest() {
        selfTestX();
        selfTestChain();
    }

private:
    VL_DEBUG_FUNC;  // Declare debug()
    VL_UNCOPYABLE(PartContraction);
};

const GraphWay* PartContraction::s_shortestWaywardCpInclusiveWay = nullptr;

//######################################################################
// DpiImportCallVisitor

// Scan node, indicate whether it contains a call to a DPI imported
// routine.
class DpiImportCallVisitor final : public AstNVisitor {
private:
    bool m_hasDpiHazard = false;  // Found a DPI import call.
    bool m_tracingCall = false;  // Iterating into a CCall to a CFunc
    // METHODS
    VL_DEBUG_FUNC;

    virtual void visit(AstCFunc* nodep) override {
        if (!m_tracingCall) return;
        m_tracingCall = false;
        if (nodep->dpiImportWrapper()) {
            if (nodep->pure() ? !v3Global.opt.threadsDpiPure()
                              : !v3Global.opt.threadsDpiUnpure()) {
                m_hasDpiHazard = true;
            }
        }
        iterateChildren(nodep);
    }
    virtual void visit(AstNodeCCall* nodep) override {
        iterateChildren(nodep);
        // Enter the function and trace it
        m_tracingCall = true;
        iterate(nodep->funcp());
    }
    virtual void visit(AstNode* nodep) override { iterateChildren(nodep); }

public:
    // CONSTRUCTORS
    explicit DpiImportCallVisitor(AstNode* nodep) { iterate(nodep); }
    bool hasDpiHazard() const { return m_hasDpiHazard; }
    virtual ~DpiImportCallVisitor() override = default;

private:
    VL_UNCOPYABLE(DpiImportCallVisitor);
};
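// Typical use, mirroring PartFixDataHazards::hasDpiHazard() below:
//   if (DpiImportCallVisitor(nodep).hasDpiHazard()) { /* serialize this mtask */ }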

//######################################################################
// PartFixDataHazards

// Fix data hazards in the partition graph.
//
// The fine-grained graph from V3Order may contain data hazards which are
// not a problem for serial mode, but which would be a problem in parallel
// mode.
//
// There are basically two classes: unordered pairs of writes, and
// unordered write-read pairs. We fix both here, with a combination of
// MTask-merges and new edges to ensure no such unordered pairs remain.
//
// ABOUT UNORDERED WRITE-WRITE PAIRS
//
//   The V3Order dependency graph treats these as unordered events:
//
//     a) sig[15:8] = stuff;
//        ...
//     b) sig[7:0] = other_stuff;
//
//   Seems OK right? They are writes to disjoint bits of the same
//   signal. They can run in either order, in serial mode, and the result
//   will be the same.
//
//   The resulting C code for each of these isn't a pure write, it's
//   actually an R-M-W sequence:
//
//     a) sig = (sig & 0xff) | (0xff00 & (stuff << 8));
//        ...
//     b) sig = (sig & 0xff00) | (0xff & other_stuff);
//
//   In serial mode, order doesn't matter so long as these run serially.
//   In parallel mode, we must serialize these RMW's to avoid a race.
//
//   We don't actually check here if each write would involve an R-M-W, we
//   just assume that it would. If this routine ever causes a drastic
//   increase in critical path, it could be optimized to make a better
//   prediction (with all the risk that word implies!) about whether a
//   given write is likely to turn into an R-M-W.
//
// ABOUT UNORDERED WRITE-READ PAIRS
//
//   If we don't put unordered write-read pairs into some order at verilation
//   time, we risk a runtime race.
//
//   How do such unordered writer/reader pairs happen? Here's a partial list
//   of scenarios:
//
//   Case 1: Circular logic
//
//     If the design has circular logic, V3Order has by now generated some
//     dependency cycles, and also cut some of the edges to make it
//     acyclic.
//
//     For serial mode, that was fine. We can break logic circles at an
//     arbitrary point. At runtime, we'll repeat the _eval() until no
//     changes are detected, which papers over the discarded dependency.
//
//     For parallel mode, this situation can lead to unordered reads and
//     writes of the same variable, causing a data race. For example if the
//     original code is this:
//
//       assign b = b | a << 2;
//       assign out = b;
//
//     ... there's originally a dependency edge which records that 'b'
//     depends on the first assign. V3Order may cut this edge, making the
//     statements unordered. In serial mode that's fine, they can run in
//     either order. In parallel mode it's a reader/writer race.
//
//   Case 2: Race Condition in Verilog Sources
//
//     If the input has races, e.g. blocking assignments in always blocks
//     that share variables, the graph at this point will contain unordered
//     writes and reads (or unordered write-write pairs) reflecting that.
//
//   Case 3: Interesting V3Order Behavior
//
//     There's code in V3Order that explicitly avoids making a dependency
//     edge from a clock-gater signal to the logic node that produces the
//     clock signal. This leads to unordered reader/writer pairs in
//     parallel mode.
//
class PartFixDataHazards final {
private:
    // TYPES
    using LogicMTaskSet = std::set<LogicMTask*, MTaskIdLessThan>;
    using TasksByRank = std::map<uint32_t /*rank*/, LogicMTaskSet>;
    using OvvSet = std::set<const OrderVarStdVertex*, OrderByPtrId&>;
    using Olv2MTaskMap = std::unordered_map<const OrderLogicVertex*, LogicMTask*>;

    // MEMBERS
    V3Graph* m_mtasksp;  // Mtask graph
    Olv2MTaskMap m_olv2mtask;  // Map OrderLogicVertex to LogicMTask who wraps it
    unsigned m_mergesDone = 0;  // Number of MTasks merged. For stats only.

public:
    // CONSTRUCTORS
    explicit PartFixDataHazards(V3Graph* mtasksp)
        : m_mtasksp{mtasksp} {}
    // METHODS
private:
    void findAdjacentTasks(OvvSet::iterator ovvIt, TasksByRank* tasksByRankp) {
        // Find all writer tasks for this variable, group by rank.
        for (V3GraphEdge* edgep = (*ovvIt)->inBeginp(); edgep; edgep = edgep->inNextp()) {
            OrderLogicVertex* logicp = dynamic_cast<OrderLogicVertex*>(edgep->fromp());
            if (!logicp) continue;
            if (logicp->domainp()->hasInitial() || logicp->domainp()->hasSettle()) continue;
            LogicMTask* writerMtaskp = m_olv2mtask.at(logicp);
            (*tasksByRankp)[writerMtaskp->rank()].insert(writerMtaskp);
        }
        // Find all reader tasks for this variable, group by rank.
        for (V3GraphEdge* edgep = (*ovvIt)->outBeginp(); edgep; edgep = edgep->outNextp()) {
            // Readers sit downstream of the variable, on the 'to' end of
            // each out-edge.
            OrderLogicVertex* logicp = dynamic_cast<OrderLogicVertex*>(edgep->top());
            if (!logicp) continue;
            if (logicp->domainp()->hasInitial() || logicp->domainp()->hasSettle()) continue;
            LogicMTask* readerMtaskp = m_olv2mtask.at(logicp);
            (*tasksByRankp)[readerMtaskp->rank()].insert(readerMtaskp);
        }
    }

    void mergeSameRankTasks(TasksByRank* tasksByRankp) {
        LogicMTask* lastMergedp = nullptr;
        for (TasksByRank::iterator rankIt = tasksByRankp->begin(); rankIt != tasksByRankp->end();
             ++rankIt) {
            // Find the largest node at this rank, merge into it. (If we
            // happen to find a huge node, this saves time in
            // partMergeEdgesFrom() versus merging into an arbitrary node.)
            LogicMTask* mergedp = nullptr;
            for (LogicMTaskSet::iterator it = rankIt->second.begin(); it != rankIt->second.end();
                 ++it) {
                LogicMTask* mtaskp = *it;
                if (mergedp) {
                    if (mergedp->cost() < mtaskp->cost()) mergedp = mtaskp;
                } else {
                    mergedp = mtaskp;
                }
            }
            rankIt->second.erase(mergedp);

            while (!rankIt->second.empty()) {
                const auto begin = rankIt->second.cbegin();
                LogicMTask* donorp = *begin;
                UASSERT_OBJ(donorp != mergedp, donorp, "Donor can't be the merge recipient");
                rankIt->second.erase(begin);
                // Merge donorp into mergedp.
                // Fix up the map, so donor's OLVs map to mergedp
                for (LogicMTask::VxList::const_iterator tmvit = donorp->vertexListp()->begin();
                     tmvit != donorp->vertexListp()->end(); ++tmvit) {
                    MTaskMoveVertex* tmvp = *tmvit;
                    OrderLogicVertex* logicp = tmvp->logicp();
                    if (logicp) m_olv2mtask[logicp] = mergedp;
                }
                // Move all vertices from donorp to mergedp
                mergedp->moveAllVerticesFrom(donorp);
                // Move edges from donorp to recipientp
                partMergeEdgesFrom(m_mtasksp, mergedp, donorp, nullptr);
                // Remove donorp from the graph
                VL_DO_DANGLING(donorp->unlinkDelete(m_mtasksp), donorp);
                m_mergesDone++;
            }

            if (lastMergedp) {
                UASSERT_OBJ(lastMergedp->rank() < mergedp->rank(), mergedp,
                            "Merging must be on lower rank");
                if (!lastMergedp->hasRelative(GraphWay::FORWARD, mergedp)) {
                    new MTaskEdge(m_mtasksp, lastMergedp, mergedp, 1);
                }
            }
            lastMergedp = mergedp;
        }
    }

    bool hasDpiHazard(LogicMTask* mtaskp) {
        for (LogicMTask::VxList::const_iterator it = mtaskp->vertexListp()->begin();
             it != mtaskp->vertexListp()->end(); ++it) {
            if (!(*it)->logicp()) continue;
            AstNode* nodep = (*it)->logicp()->nodep();
            // NOTE: We don't handle DPI exports. If testbench code calls a
            // DPI-exported function at any time during eval() we may have
            // a data hazard. (Likewise in non-threaded mode if an export
            // messes with an ordered variable we're broken.)

            // Find all calls to DPI-imported functions, we can put those
            // into a serial order at least. That should solve the most
            // likely DPI-related data hazards.
            if (DpiImportCallVisitor(nodep).hasDpiHazard()) {  //
                return true;
            }
        }
        return false;
    }

public:
    void go() {
        vluint64_t startUsecs = 0;
        if (debug() >= 3) startUsecs = V3Os::timeUsecs();

        // Build an OLV->mtask map and a set of OVVs
        OrderByPtrId ovvOrder;
        OvvSet ovvSet(ovvOrder);
        // OVV's which wrap systemC vars will be handled slightly specially
        OvvSet ovvSetSystemC(ovvOrder);

        for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
            LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(vxp);
            // Should be only one MTaskMoveVertex in each mtask at this
            // stage, but whatever, write it as a loop:
            for (LogicMTask::VxList::const_iterator it = mtaskp->vertexListp()->begin();
                 it != mtaskp->vertexListp()->end(); ++it) {
                MTaskMoveVertex* tmvp = *it;
                if (OrderLogicVertex* logicp = tmvp->logicp()) {
                    m_olv2mtask[logicp] = mtaskp;
                    // Look at downstream vars.
                    for (V3GraphEdge* edgep = logicp->outBeginp(); edgep;
                         edgep = edgep->outNextp()) {
                        // Only consider OrderVarStdVertex which reflects
                        // an actual lvalue assignment; the others do not.
                        OrderVarStdVertex* ovvp = dynamic_cast<OrderVarStdVertex*>(edgep->top());
                        if (!ovvp) continue;
                        if (ovvp->varScp()->varp()->isSc()) {
                            ovvSetSystemC.insert(ovvp);
                        } else {
                            ovvSet.insert(ovvp);
                        }
                    }
                }
            }
        }

        // Rank the graph.
        // DGS is faster than V3GraphAlg's recursive rank, in the worst
        // cases where the recursive rank must pass through the same node
        // many times. (We saw 22s for DGS vs. 500s for recursive rank on
        // one large design.)
        {
            GraphStreamUnordered serialize(m_mtasksp);
            const V3GraphVertex* vertexp;
            while ((vertexp = serialize.nextp())) {
                uint32_t rank = 0;
                for (V3GraphEdge* edgep = vertexp->inBeginp(); edgep; edgep = edgep->inNextp()) {
                    rank = std::max(edgep->fromp()->rank() + 1, rank);
                }
                const_cast<V3GraphVertex*>(vertexp)->rank(rank);
            }
        }

        // For each OrderVarVertex, look at its writer and reader mtasks.
        //
        // If there's a set of writers and readers at the same rank, we
        // know these are unordered with respect to one another, so merge
        // those mtasks all together.
        //
        // At this point, we have at most one merged mtask per rank (for a
        // given OVV.) Create edges across these remaining mtasks to ensure
        // they run in serial order (going along with the existing ranks.)
        //
        // NOTE: we don't update the CP's stored in the LogicMTasks to
        // reflect the changes we make to the graph. That's OK, as we
        // haven't yet initialized CPs when we call this routine.
        for (OvvSet::iterator ovvit = ovvSet.begin(); ovvit != ovvSet.end(); ++ovvit) {
            // Build a set of mtasks, per rank, which access this var.
            // Within a rank, sort by MTaskID to avoid nondeterminism.
            TasksByRank tasksByRank;

            // Find all reader and writer tasks for this variable, add to
            // tasksByRank.
            findAdjacentTasks(ovvit, &tasksByRank);

            // Merge all writer and reader tasks from same rank together.
            //
            // NOTE: Strictly speaking, we don't need to merge all the
            // readers together. That may lead to extra serialization. The
            // least amount of ordering we could impose here would be to
            // merge all writers at a given rank together; then make edges
            // from the merged writer node to each reader node at the same
            // rank; and then from each reader node to the merged writer at
            // the next rank.
            //
            // Whereas, merging all readers and writers at the same rank
            // together is "the simplest thing that could possibly work"
            // and it seems to. It also creates fairly few edges. We don't
            // want to create tons of edges here, doing so is not nice to
            // the main edge contraction pass.
            mergeSameRankTasks(&tasksByRank);
        }

        // Handle SystemC vars just a little differently. Instead of
        // treating each var as an independent entity, and serializing
        // writes to that one var, we treat ALL systemC vars as a single
        // entity and serialize writes (and, conservatively, reads) across
        // all of them.
        //
        // Reasoning: writing a systemC var actually turns into a call to a
        // var.write() method, which under the hood is accessing some data
        // structure that's shared by many SC vars. It's not thread safe.
        //
        // Hopefully we only have a few SC vars -- top level ports, probably.
        {
            TasksByRank tasksByRank;
            for (OvvSet::iterator ovvit = ovvSetSystemC.begin(); ovvit != ovvSetSystemC.end();
                 ++ovvit) {
                findAdjacentTasks(ovvit, &tasksByRank);
            }
            mergeSameRankTasks(&tasksByRank);
        }

        // Handle nodes containing DPI calls, we want to serialize those
        // by default unless user gave --threads-dpi-concurrent.
        // Same basic strategy as above to serialize access to SC vars.
        if (!v3Global.opt.threadsDpiPure() || !v3Global.opt.threadsDpiUnpure()) {
            TasksByRank tasksByRank;
            for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp(); vxp;
                 vxp = vxp->verticesNextp()) {
                LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(vxp);
                if (hasDpiHazard(mtaskp)) tasksByRank[vxp->rank()].insert(mtaskp);
            }
            mergeSameRankTasks(&tasksByRank);
        }

        UINFO(4, "PartFixDataHazards() merged " << m_mergesDone << " pairs of nodes in "
                                                << (V3Os::timeUsecs() - startUsecs)
                                                << " usecs.\n");
    }

private:
    VL_UNCOPYABLE(PartFixDataHazards);
    VL_DEBUG_FUNC;
};

//######################################################################
// ThreadSchedule

class PartPackMTasks;

// The thread schedule, containing all information needed later. Note that
// this is a simple aggregate data type, and the only way to get hold of an
// instance of it is via PartPackMTasks::pack, which returns it as const.
// Callers can therefore acquire only a const reference to it, so no further
// modifications are allowed, and all members can safely be public
// (attributes).
class ThreadSchedule final {
public:
    // Allocation of sequence of MTasks to threads. Can be considered a map from thread ID to
    // the sequence of MTasks to be executed by that thread.
    std::vector<std::vector<const ExecMTask*>> threads;

    // Map from MTask to ID of thread it is assigned to.
    std::unordered_map<const ExecMTask*, uint32_t> threadId;

private:
    friend class PartPackMTasks;

    explicit ThreadSchedule(uint32_t nThreads)
        : threads{nThreads} {}
    VL_UNCOPYABLE(ThreadSchedule);  // But movable
    ThreadSchedule(ThreadSchedule&&) = default;
    ThreadSchedule& operator=(ThreadSchedule&&) = default;

public:
    // Returns the number of cross-thread dependencies of the given MTask. If > 0, the MTask must
    // test whether its dependencies are ready before starting, and therefore may need to block.
    uint32_t crossThreadDependencies(const ExecMTask* mtaskp) const {
        const uint32_t thisThreadId = threadId.at(mtaskp);
        uint32_t result = 0;
        for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep; edgep = edgep->inNextp()) {
            const ExecMTask* const prevp = dynamic_cast<ExecMTask*>(edgep->fromp());
            if (threadId.at(prevp) != thisThreadId) ++result;
        }
        return result;
    }
};
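
// Typical consumption of a schedule (a sketch; 'execGraph' and 'mtaskp' are
// placeholders for whatever graph and task the caller is emitting code for):
//   const ThreadSchedule& schedule = PartPackMTasks{}.pack(execGraph);
//   if (schedule.crossThreadDependencies(mtaskp) > 0) {
//       // ...emit code that waits on the dependency count before running
//   }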

//######################################################################
// PartPackMTasks

// Statically pack tasks into threads.
//
// The simplest thing that could possibly work would be to assume that our
// predictions of task runtimes are precise, and that every thread will
// make progress at an equal rate. Simulate a single "clock", pack the
// highest priority ready task into whatever thread becomes ready earliest,
// repeating until no tasks remain.
//
// That doesn't work well, as our predictions of task runtimes have wide
// error bars (+/- 60% is typical.)
//
// So be a little more clever: let each task have a different end time,
// depending on which thread is looking. Be a little bit pessimistic when
// thread A checks the end time of an mtask running on thread B. This extra
// "padding" avoids tight "layovers" at cross-thread dependencies.
class PartPackMTasks final {
    // CONSTANTS
    static constexpr uint32_t UNASSIGNED = 0xffffffff;

    // TYPES
    struct MTaskState {
        uint32_t completionTime = 0;  // Estimated time this mtask will complete
        uint32_t threadId = UNASSIGNED;  // Thread id this MTask is assigned to
        const ExecMTask* nextp = nullptr;  // Next MTask on same thread after this
    };

    struct MTaskCmp {
        bool operator()(const ExecMTask* ap, const ExecMTask* bp) const {
            return ap->id() < bp->id();
        }
    };
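    // (Comparing by MTask ID rather than by pointer keeps any set ordered by
    // MTaskCmp, and hence the packing below, deterministic from run to run;
    // compare the "sort by MTaskID to avoid nondeterminism" note in
    // PartFixDataHazards above.)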

    // MEMBERS
    const uint32_t m_nThreads;  // Number of threads
    const uint32_t m_sandbagNumerator;  // Numerator padding for est runtime
    const uint32_t m_sandbagDenom;  // Denominator padding for est runtime

    std::unordered_map<const ExecMTask*, MTaskState> m_mtaskState;  // State for each mtask.

public:
    // CONSTRUCTORS
    explicit PartPackMTasks(uint32_t nThreads = v3Global.opt.threads(),
                            unsigned sandbagNumerator = 30, unsigned sandbagDenom = 100)
        : m_nThreads{nThreads}
        , m_sandbagNumerator{sandbagNumerator}
        , m_sandbagDenom{sandbagDenom} {}
    ~PartPackMTasks() = default;

private:
    // METHODS
    uint32_t completionTime(const ExecMTask* mtaskp, uint32_t threadId) {
        const MTaskState& state = m_mtaskState[mtaskp];
        UASSERT(state.threadId != UNASSIGNED, "Mtask should have assigned thread");
        if (threadId == state.threadId) {
            // No overhead on same thread
            return state.completionTime;
        }

        // Add some padding to the estimated runtime when looking from
        // another thread
        uint32_t sandbaggedEndTime
            = state.completionTime + (m_sandbagNumerator * mtaskp->cost()) / m_sandbagDenom;

        // If task B is packed after task A on thread 0, don't let thread 1
        // think that A finishes earlier than thread 0 thinks that B
        // finishes, otherwise we get priority inversions and fail the self
        // test.
        if (state.nextp) {
            const uint32_t successorEndTime = completionTime(state.nextp, state.threadId);
            if ((sandbaggedEndTime >= successorEndTime) && (successorEndTime > 1)) {
                sandbaggedEndTime = successorEndTime - 1;
            }
        }

        UINFO(6, "Sandbagged end time for " << mtaskp->name() << " on th " << threadId << " = "
                                            << sandbaggedEndTime << endl);
        return sandbaggedEndTime;
    }

    bool isReady(const ExecMTask* mtaskp) {
        for (V3GraphEdge* edgeInp = mtaskp->inBeginp(); edgeInp; edgeInp = edgeInp->inNextp()) {
            const ExecMTask* const prevp = dynamic_cast<ExecMTask*>(edgeInp->fromp());
            if (m_mtaskState[prevp].threadId == UNASSIGNED) {
                // This predecessor is not assigned yet
                return false;
            }
        }
        return true;
    }

public:
    // Pack the MTasks from the given graph into m_nThreads threads, return the schedule.
    const ThreadSchedule pack(const V3Graph& mtaskGraph) {
        // The result
        ThreadSchedule schedule(m_nThreads);

        // Time each thread is occupied until
        std::vector<uint32_t> busyUntil(m_nThreads, 0);

        // MTasks ready to be assigned next. All their dependencies are already assigned.
        std::set<const ExecMTask*, MTaskCmp> readyMTasks;

        // Build initial ready list
        for (V3GraphVertex* vxp = mtaskGraph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
            const ExecMTask* const mtaskp = dynamic_cast<ExecMTask*>(vxp);
            if (isReady(mtaskp)) readyMTasks.insert(mtaskp);
        }

        // Clear algorithm state
        m_mtaskState.clear();

        while (!readyMTasks.empty()) {
            // For each task in the ready set, compute when it might start
            // on each thread (in that thread's local time frame.)
            uint32_t bestTime = 0xffffffff;
            uint32_t bestThreadId = 0;
            const ExecMTask* bestMtaskp = nullptr;
            for (uint32_t threadId = 0; threadId < m_nThreads; ++threadId) {
                for (const ExecMTask* const mtaskp : readyMTasks) {
                    uint32_t timeBegin = busyUntil[threadId];
                    if (timeBegin > bestTime) {
                        UINFO(6, "th " << threadId << " busy until " << timeBegin
                                       << ", later than bestTime " << bestTime
                                       << ", skipping thread.\n");
                        break;
                    }
                    for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep;
                         edgep = edgep->inNextp()) {
                        const ExecMTask* const priorp = dynamic_cast<ExecMTask*>(edgep->fromp());
                        const uint32_t priorEndTime = completionTime(priorp, threadId);
                        if (priorEndTime > timeBegin) timeBegin = priorEndTime;
                    }
                    UINFO(6, "Task " << mtaskp->name() << " start at " << timeBegin
                                     << " on thread " << threadId << endl);
                    if ((timeBegin < bestTime)
                        || ((timeBegin == bestTime)
                            && bestMtaskp  // Redundant, but appeases static analysis tools
                            && (mtaskp->priority() > bestMtaskp->priority()))) {
                        bestTime = timeBegin;
                        bestThreadId = threadId;
                        bestMtaskp = mtaskp;
                    }
                }
            }

            UASSERT(bestMtaskp, "Should have found some task");
            UINFO(6, "Will schedule " << bestMtaskp->name() << " onto thread " << bestThreadId
                                      << endl);

            // Reference to thread in schedule we are assigning this MTask to.
            std::vector<const ExecMTask*>& bestThread = schedule.threads[bestThreadId];

            // Update algorithm state
            const uint32_t bestEndTime = bestTime + bestMtaskp->cost();
            m_mtaskState[bestMtaskp].completionTime = bestEndTime;
            m_mtaskState[bestMtaskp].threadId = bestThreadId;
            if (!bestThread.empty()) { m_mtaskState[bestThread.back()].nextp = bestMtaskp; }
            busyUntil[bestThreadId] = bestEndTime;

            // Add the MTask to the schedule
            bestThread.push_back(bestMtaskp);
            schedule.threadId[bestMtaskp] = bestThreadId;

            // Update the ready list
            const size_t erased = readyMTasks.erase(bestMtaskp);
            UASSERT_OBJ(erased > 0, bestMtaskp, "Should have erased something?");
            for (V3GraphEdge* edgeOutp = bestMtaskp->outBeginp(); edgeOutp;
                 edgeOutp = edgeOutp->outNextp()) {
                const ExecMTask* const nextp = dynamic_cast<ExecMTask*>(edgeOutp->top());
                // Dependent MTask should not yet be assigned to a thread
                UASSERT(m_mtaskState[nextp].threadId == UNASSIGNED,
                        "Tasks after one being assigned should not be assigned yet");
                // Dependent MTask should not be ready yet, since dependency is just being assigned
                UASSERT_OBJ(readyMTasks.find(nextp) == readyMTasks.end(), nextp,
                            "Tasks after one being assigned should not be ready");
                if (isReady(nextp)) {
                    readyMTasks.insert(nextp);
                    UINFO(6, "Inserted " << nextp->name() << " into ready\n");
                }
            }
        }

        return schedule;
    }
|
|
|
|
|
|
|
|
|
|

    // SELF TEST
    static void selfTest() {
        V3Graph graph;
        ExecMTask* const t0 = new ExecMTask(&graph, nullptr, 0);
        t0->cost(1000);
        t0->priority(1100);
        ExecMTask* const t1 = new ExecMTask(&graph, nullptr, 1);
        t1->cost(100);
        t1->priority(100);
        ExecMTask* const t2 = new ExecMTask(&graph, nullptr, 2);
        t2->cost(100);
        t2->priority(100);

        new V3GraphEdge(&graph, t0, t1, 1);
        new V3GraphEdge(&graph, t0, t2, 1);

        PartPackMTasks packer(2,  // Threads
                              3,  // Sandbag numerator
                              10);  // Sandbag denominator
        const ThreadSchedule& schedule = packer.pack(graph);

        UASSERT_SELFTEST(size_t, schedule.threads.size(), 2);

        UASSERT_SELFTEST(size_t, schedule.threads[0].size(), 2);
        UASSERT_SELFTEST(size_t, schedule.threads[1].size(), 1);

        UASSERT_SELFTEST(const ExecMTask*, schedule.threads[0][0], t0);
        UASSERT_SELFTEST(const ExecMTask*, schedule.threads[0][1], t1);
        UASSERT_SELFTEST(const ExecMTask*, schedule.threads[1][0], t2);

        UASSERT_SELFTEST(size_t, schedule.threadId.size(), 3);

        UASSERT_SELFTEST(uint32_t, schedule.threadId.at(t0), 0);
        UASSERT_SELFTEST(uint32_t, schedule.threadId.at(t1), 0);
        UASSERT_SELFTEST(uint32_t, schedule.threadId.at(t2), 1);

        // On its native thread, we see the actual end time for t0:
        UASSERT_SELFTEST(uint32_t, packer.completionTime(t0, 0), 1000);
        // On the other thread, we see a sandbagged end time which does not
        // exceed the t1 end time:
        UASSERT_SELFTEST(uint32_t, packer.completionTime(t0, 1), 1099);

        // Actual end time on native thread:
        UASSERT_SELFTEST(uint32_t, packer.completionTime(t1, 0), 1100);
        // Sandbagged end time seen on thread 1. Note it does not compound
        // with t0's sandbagged time; compounding caused trouble in
        // practice.
        UASSERT_SELFTEST(uint32_t, packer.completionTime(t1, 1), 1130);
        UASSERT_SELFTEST(uint32_t, packer.completionTime(t2, 0), 1229);
        UASSERT_SELFTEST(uint32_t, packer.completionTime(t2, 1), 1199);
    }

private:
    VL_DEBUG_FUNC;  // Declare debug()
    VL_UNCOPYABLE(PartPackMTasks);
};

//######################################################################
// V3Partition implementation

void V3Partition::debugMTaskGraphStats(const V3Graph* graphp, const string& stage) {
    if (!debug()) return;

    UINFO(4, "\n");
    UINFO(4, " Stats for " << stage << endl);
    uint32_t mtaskCount = 0;
    uint32_t totalCost = 0;
    std::array<uint32_t, 32> mtaskCostHist;
    mtaskCostHist.fill(0);

    for (const V3GraphVertex* mtaskp = graphp->verticesBeginp(); mtaskp;
         mtaskp = mtaskp->verticesNextp()) {
        ++mtaskCount;
        uint32_t mtaskCost = dynamic_cast<const AbstractMTask*>(mtaskp)->cost();
        totalCost += mtaskCost;

        // Compute log2Cost = floor(log2(mtaskCost)) for the histogram bucket
        unsigned log2Cost = 0;
        while (mtaskCost >>= 1) ++log2Cost;
        UASSERT(log2Cost < 32, "log2Cost overflow in debugMTaskGraphStats");
        ++mtaskCostHist[log2Cost];
    }
    UINFO(4, " Total mtask cost = " << totalCost << "\n");
    UINFO(4, " Mtask count = " << mtaskCount << "\n");
    UINFO(4, " Avg cost / mtask = "
                 << ((mtaskCount > 0) ? cvtToStr(totalCost / mtaskCount) : "INF!") << "\n");
    UINFO(4, " Histogram of mtask costs:\n");
    for (unsigned i = 0; i < 32; ++i) {
        if (mtaskCostHist[i]) {
            UINFO(4, " 2^" << i << ": " << mtaskCostHist[i] << endl);
            V3Stats::addStat("MTask graph, " + stage + ", mtask cost 2^" + (i < 10 ? " " : "")
                                 + cvtToStr(i),
                             mtaskCostHist[i]);
        }
    }

    if (mtaskCount < 1000) {
        string filePrefix("ordermv_");
        filePrefix += stage;
        if (debug() >= 4) graphp->dumpDotFilePrefixedAlways(filePrefix);
    }

    // Look only at the cost of each mtask, neglect communication cost.
    // This will show us how much parallelism we expect, assuming cache-miss
    // costs are minor and the cost of running logic is the dominant cost.
    PartParallelismEst vertexParEst(graphp);
    vertexParEst.traverse();
    vertexParEst.statsReport(stage);
    if (debug() >= 4) {
        UINFO(0, "\n");
        UINFO(0, " Parallelism estimate based on mtask costs:\n");
        vertexParEst.debugReport();
    }
}

// Print a hash of the shape of graphp. If you are battling
// nondeterminism, this can help to pinpoint where in the pipeline it's
// creeping in.
void V3Partition::hashGraphDebug(const V3Graph* graphp, const char* debugName) {
    // Disabled when there are no nondeterminism issues in flight.
    if (!v3Global.opt.debugNondeterminism()) return;

    std::unordered_map<const V3GraphVertex*, uint32_t> vx2Id;
    unsigned id = 0;
    for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
        vx2Id[vxp] = id++;
    }
    unsigned hash = 0;
    for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
        for (const V3GraphEdge* edgep = vxp->outBeginp(); edgep; edgep = edgep->outNextp()) {
            const V3GraphVertex* top = edgep->top();
            hash = vx2Id[top] + 31U * hash;  // The K&R hash function
        }
    }
    UINFO(0, "Hash of shape (not contents) of " << debugName << " = " << cvtToStr(hash) << endl);
}
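
// Typical use of the shape hash (assuming a nondeterminism hunt is in
// progress): run the same build twice with the nondeterminism debug option
// enabled, and diff the hashes printed at each stage; the first stage whose
// hash differs between the two runs is where the graph shape, and hence
// iteration order, started to diverge.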

void V3Partition::setupMTaskDeps(V3Graph* mtasksp, const Vx2MTaskMap* vx2mtaskp) {
    // Look at each mtask
    for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) {
        LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(itp);
        const LogicMTask::VxList* vertexListp = mtaskp->vertexListp();

        // For each logic vertex in this mtask, create an mtask-to-mtask
        // edge based on the logic-to-logic edge.
        for (LogicMTask::VxList::const_iterator vit = vertexListp->begin();
             vit != vertexListp->end(); ++vit) {
            for (V3GraphEdge* outp = (*vit)->outBeginp(); outp; outp = outp->outNextp()) {
                UASSERT(outp->weight() > 0, "Mtask not assigned weight");
                const MTaskMoveVertex* top = dynamic_cast<MTaskMoveVertex*>(outp->top());
                UASSERT(top, "MoveVertex not associated to mtask");
                const auto it = vlstd::as_const(vx2mtaskp)->find(top);
                UASSERT(it != vx2mtaskp->end(), "MTask map can't find id");
                LogicMTask* otherMTaskp = it->second;
                UASSERT(otherMTaskp, "nullptr other Mtask");
                UASSERT_OBJ(otherMTaskp != mtaskp, mtaskp, "Would create a cycle edge");

                // Don't create redundant edges.
                if (mtaskp->hasRelative(GraphWay::FORWARD, otherMTaskp)) {  //
                    continue;
                }
                new MTaskEdge(mtasksp, mtaskp, otherMTaskp, 1);
            }
        }
    }
}

void V3Partition::go(V3Graph* mtasksp) {
    // Called by V3Order
    hashGraphDebug(m_fineDepsGraphp, "v3partition initial fine-grained deps");

    // Create the first MTasks. Initially, each MTask just wraps one
    // MTaskMoveVertex. Over time, we'll merge MTasks together and
    // eventually each MTask will wrap a large number of MTaskMoveVertices
    // (and the logic nodes therein).
    uint32_t totalGraphCost = 0;
    {
        // The V3InstrCount within LogicMTask will set user5 on each AST
        // node, to assert that we never count any node twice.
        AstUser5InUse inUser5;
        Vx2MTaskMap vx2mtask;
        for (V3GraphVertex* vxp = m_fineDepsGraphp->verticesBeginp(); vxp;
             vxp = vxp->verticesNextp()) {
            MTaskMoveVertex* mtmvVxp = dynamic_cast<MTaskMoveVertex*>(vxp);
            UASSERT_OBJ(mtmvVxp, vxp, "Every vertex here should be an MTaskMoveVertex");

            LogicMTask* mtaskp = new LogicMTask(mtasksp, mtmvVxp);
            vx2mtask[mtmvVxp] = mtaskp;

            totalGraphCost += mtaskp->cost();
        }

        // Create the mtask->mtask dep edges based on vertex deps
        setupMTaskDeps(mtasksp, &vx2mtask);
    }

    V3Partition::debugMTaskGraphStats(mtasksp, "initial");

    // For debug: print out the longest critical path. This allows us to
    // verify that the costs look reasonable, that we aren't combining
    // nodes that should probably be split, etc.
    if (v3Global.opt.dumpTreeLevel(__FILE__) >= 3) {
        LogicMTask::dumpCpFilePrefixed(mtasksp, "cp");
    }

    // Merge nodes that could present data hazards; see comment within.
    {
        PartFixDataHazards(mtasksp).go();
        V3Partition::debugMTaskGraphStats(mtasksp, "hazards");
        hashGraphDebug(mtasksp, "mtasksp after fixDataHazards()");
    }

    // Setup the critical path into and out of each node.
    partInitCriticalPaths(mtasksp);
    hashGraphDebug(mtasksp, "after partInitCriticalPaths()");

    // Order the graph. We know it's already ranked from fixDataHazards()
    // so we don't need to rank it again.
    //
    // On at least some models, ordering the graph here seems to help
    // performance. (Why? Is it just triggering noise in a lucky direction?
    // Is it just as likely to harm results?)
    //
    // More diversity of models that can build with --threads will
    // eventually tell us. For now keep the order() so we don't forget
    // about it, in case it actually helps. TODO: get more data and maybe
    // remove this later if it doesn't really help.
    mtasksp->orderPreRanked();

    const int targetParFactor = v3Global.opt.threads();
    if (targetParFactor < 2) v3fatalSrc("We should not reach V3Partition when --threads <= 1");

    // Set cpLimit to roughly totalGraphCost / nThreads
    //
    // Actually set it a bit lower, by a hardcoded fudge factor. This
    // results in more, smaller mtasks, which helps reduce fragmentation
    // when scheduling them.
    const unsigned fudgeNumerator = 3;
    const unsigned fudgeDenominator = 5;
    uint32_t cpLimit = ((totalGraphCost * fudgeNumerator) / (targetParFactor * fudgeDenominator));
    UINFO(4, "V3Partition set cpLimit = " << cpLimit << endl);

    // Merge MTask nodes together, repeatedly, until the CP budget is
    // reached. Coarsens the graph, usually by several orders of
    // magnitude.
    //
    // Some tests disable this, hence the test on threadsCoarsen().
    // Coarsening is always enabled in production.
    if (v3Global.opt.threadsCoarsen()) {
        PartContraction(mtasksp, cpLimit,
                        // --debugPartition is used by tests
                        // to enable slow assertions.
                        v3Global.opt.debugPartition())
            .go();
        V3Partition::debugMTaskGraphStats(mtasksp, "contraction");
    }
    {
        mtasksp->removeTransitiveEdges();
        V3Partition::debugMTaskGraphStats(mtasksp, "transitive1");
    }

    // Reassign MTask IDs onto smaller numbers, which should be more stable
    // across small logic changes. Keep MTask IDs in the same relative
    // order though, otherwise we break CmpLogicMTask for still-existing
    // EdgeSets that haven't destructed yet.
    {
        using SortedMTaskSet = std::set<LogicMTask*, LogicMTask::CmpLogicMTask>;
        SortedMTaskSet sorted;
        for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) {
            LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(itp);
            sorted.insert(mtaskp);
        }
        uint32_t nextId = 1;
        for (auto it = sorted.begin(); it != sorted.end(); ++it) {
            // We shouldn't perturb the sort order of the set, despite
            // changing the IDs; they should all just remain in the same
            // relative order. Confirm that:
            UASSERT(nextId <= (*it)->id(), "Should only shrink MTaskIDs here");
            UINFO(4, "Reassigning MTask id " << (*it)->id() << " to id " << nextId << "\n");
            (*it)->id(nextId);
            nextId++;
        }
    }

    // Set color to indicate an mtaskId on every underlying MTaskMoveVertex.
    for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) {
        LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(itp);
        for (LogicMTask::VxList::const_iterator it = mtaskp->vertexListp()->begin();
             it != mtaskp->vertexListp()->end(); ++it) {
            MTaskMoveVertex* mvertexp = *it;
            mvertexp->color(mtaskp->id());
        }
    }
}

static void finalizeCosts(V3Graph* execMTaskGraphp) {
    GraphStreamUnordered ser(execMTaskGraphp, GraphWay::REVERSE);

    while (const V3GraphVertex* vxp = ser.nextp()) {
        ExecMTask* mtp = dynamic_cast<ExecMTask*>(const_cast<V3GraphVertex*>(vxp));
        uint32_t costCount = V3InstrCount::count(mtp->bodyp(), false);
        mtp->cost(costCount);
        mtp->priority(costCount);

        // "Priority" is the critical path from the start of the mtask, to
        // the end of the graph reachable from this mtask. Given the
        // choice among several ready mtasks, we'll want to start the
        // highest priority one first, so we're always working on the "long
        // pole".
        for (V3GraphEdge* edgep = mtp->outBeginp(); edgep; edgep = edgep->outNextp()) {
            ExecMTask* followp = dynamic_cast<ExecMTask*>(edgep->top());
            if ((followp->priority() + mtp->cost()) > mtp->priority()) {
                mtp->priority(followp->priority() + mtp->cost());
            }
        }
    }
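
    // Worked example of the priority computation (hypothetical costs, for
    // illustration only): for a chain A(cost 10) -> B(cost 20) -> C(cost 5),
    // the reverse traversal visits C, B, A and yields priority(C) = 5,
    // priority(B) = 20 + 5 = 25, priority(A) = 10 + 25 = 35, i.e. each
    // mtask's critical path to the end of the graph.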

    // Some MTasks may now have zero cost, eliminate those.
    // (It's common for tasks to shrink to nothing when V3LifePost
    // removes dly assignments.)
    for (V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp;) {
        ExecMTask* mtp = dynamic_cast<ExecMTask*>(vxp);
        vxp = vxp->verticesNextp();  // Advance before delete

        // Don't rely on checking mtp->cost() == 0 to detect an empty task.
        // Our cost-estimating logic is just an estimate. Instead, check
        // the MTaskBody to see if it's empty. That's the source of truth.
        AstMTaskBody* bodyp = mtp->bodyp();
        if (!bodyp->stmtsp()) {  // Kill this empty mtask
            UINFO(6, "Removing zero-cost " << mtp->name() << endl);
            // Bypass the removed task: connect each of its predecessors
            // directly to each of its successors.
            for (V3GraphEdge* inp = mtp->inBeginp(); inp; inp = inp->inNextp()) {
                for (V3GraphEdge* outp = mtp->outBeginp(); outp; outp = outp->outNextp()) {
                    new V3GraphEdge(execMTaskGraphp, inp->fromp(), outp->top(), 1);
                }
            }
            VL_DO_DANGLING(mtp->unlinkDelete(execMTaskGraphp), mtp);
            // Also remove and delete the AstMTaskBody, otherwise it would
            // keep a dangling pointer to the ExecMTask.
            VL_DO_DANGLING(bodyp->unlinkFrBack()->deleteTree(), bodyp);
        }
    }

    // Removing tasks may cause edges that were formerly non-transitive to
    // become transitive. Also we just created new edges around the removed
    // tasks, which could be transitive. Prune out all transitive edges.
    {
        execMTaskGraphp->removeTransitiveEdges();
        V3Partition::debugMTaskGraphStats(execMTaskGraphp, "transitive2");
    }

    // Record summary stats for the final mtask graph.
    // (More verbose stats are available with --debugi-V3Partition >= 3.)
    PartParallelismEst parEst(execMTaskGraphp);
    parEst.traverse();
    parEst.statsReport("final");
    if (debug() >= 3) {
        UINFO(0, " Final mtask parallelism report:\n");
        parEst.debugReport();
    }
}

static void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t threadId,
                               AstCFunc* funcp, const ExecMTask* mtaskp) {
    AstNodeModule* const modp = v3Global.rootp()->topModulep();
    FileLine* const fl = modp->fileline();

    // Helper function to make the code a bit more legible
    const auto addStrStmt = [=](const string& stmt) -> void {  //
        funcp->addStmtsp(new AstCStmt(fl, stmt));
    };

    if (const uint32_t nDependencies = schedule.crossThreadDependencies(mtaskp)) {
        // This mtask has dependencies executed on another thread, so it may block. Create the
        // task state variable and wait to be notified.
        const string name = "__Vm_mtaskstate_" + cvtToStr(mtaskp->id());
        AstBasicDType* const mtaskStateDtypep
            = v3Global.rootp()->typeTablep()->findBasicDType(fl, AstBasicDTypeKwd::MTASKSTATE);
        AstVar* const varp = new AstVar(fl, AstVarType::MODULETEMP, name, mtaskStateDtypep);
        varp->valuep(new AstConst(fl, nDependencies));
        varp->protect(false);  // Do not protect as we still have references in AstText
        modp->addStmtp(varp);
        // For now, reference is still via text bashing
        addStrStmt("vlSelf->" + name + ".waitUntilUpstreamDone(even_cycle);\n");
    }

    string recName;
    if (v3Global.opt.profThreads()) {
        recName = "__Vprfthr_" + cvtToStr(mtaskp->id());
        addStrStmt("VlProfileRec* " + recName + " = nullptr;\n");
        // Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
        addStrStmt("if (VL_UNLIKELY(vlSelf->__Vm_profile_cycle_start)) {\n" +  //
                   recName + " = vlSelf->__Vm_threadPoolp->profileAppend();\n" +  //
                   recName + "->startRecord(VL_RDTSC_Q() - vlSelf->__Vm_profile_cycle_start," +  //
                   " " + cvtToStr(mtaskp->id()) + "," +  //
                   " " + cvtToStr(mtaskp->cost()) + ");\n" +  //
                   "}\n");
    }

    // Tell the runtime which mtask is executing
    addStrStmt("Verilated::mtaskId(" + cvtToStr(mtaskp->id()) + ");\n");

    // Move the actual body of calls to leaf functions into this function
    funcp->addStmtsp(mtaskp->bodyp()->unlinkFrBack());

    if (v3Global.opt.profThreads()) {
        // Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
        addStrStmt("if (VL_UNLIKELY(" + recName + ")) {\n" +  //
                   recName + "->endRecord(VL_RDTSC_Q() - vlSelf->__Vm_profile_cycle_start);\n"
                   + "}\n");
    }

    // Flush message queue
    addStrStmt("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");

    // For any dependent mtask that's on another thread, signal one dependency completion.
    for (V3GraphEdge* edgep = mtaskp->outBeginp(); edgep; edgep = edgep->outNextp()) {
        const ExecMTask* const nextp = dynamic_cast<ExecMTask*>(edgep->top());
        if (schedule.threadId.at(nextp) != threadId) {
            addStrStmt("vlSelf->__Vm_mtaskstate_" + cvtToStr(nextp->id())
                       + ".signalUpstreamDone(even_cycle);\n");
        }
    }
}
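
// For orientation, the statements above emit per-mtask code of roughly this
// shape (mtask ids 5 and 7 are made up for this sketch, and the optional
// profiling code is elided):
//
//     vlSelf->__Vm_mtaskstate_5.waitUntilUpstreamDone(even_cycle);
//     Verilated::mtaskId(5);
//     // ...the mtask body moved in from the AstMTaskBody...
//     Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);
//     vlSelf->__Vm_mtaskstate_7.signalUpstreamDone(even_cycle);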

static const std::vector<AstCFunc*> createThreadFunctions(const ThreadSchedule& schedule) {
    AstNodeModule* const modp = v3Global.rootp()->topModulep();
    FileLine* const fl = modp->fileline();

    std::vector<AstCFunc*> funcps;

    // For each thread, create a function representing its entry point
    for (const std::vector<const ExecMTask*>& thread : schedule.threads) {
        if (thread.empty()) continue;
        const uint32_t threadId = schedule.threadId.at(thread.front());
        string name = "__Vthread_";
        name += cvtToStr(threadId);
        AstCFunc* const funcp = new AstCFunc(fl, name, nullptr, "void");
        modp->addStmtp(funcp);
        funcps.push_back(funcp);
        funcp->isStatic(true);  // Uses void self pointer, so static and hand rolled
        funcp->isLoose(true);
        funcp->entryPoint(true);
        funcp->argTypes("void* voidSelf, bool even_cycle");

        // Set up vlSelf and vlSyms
        funcp->addStmtsp(new AstCStmt(fl, EmitCBaseVisitor::voidSelfAssign()));
        funcp->addStmtsp(new AstCStmt(fl, EmitCBaseVisitor::symClassAssign()));

        // Invoke each mtask scheduled to this thread from the thread function
        for (const ExecMTask* const mtaskp : thread) {
            addMTaskToFunction(schedule, threadId, funcp, mtaskp);
        }

        // Unblock the fake "final" mtask when this thread is finished
        funcp->addStmtsp(
            new AstCStmt(fl, "vlSelf->__Vm_mtaskstate_final.signalUpstreamDone(even_cycle);\n"));
    }

    // Create the fake "final" mtask state variable
    AstBasicDType* const mtaskStateDtypep
        = v3Global.rootp()->typeTablep()->findBasicDType(fl, AstBasicDTypeKwd::MTASKSTATE);
    AstVar* const varp
        = new AstVar(fl, AstVarType::MODULETEMP, "__Vm_mtaskstate_final", mtaskStateDtypep);
    varp->valuep(new AstConst(fl, funcps.size()));
    varp->protect(false);  // Do not protect as we still have references in AstText
    modp->addStmtp(varp);

    return funcps;
}
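
// Each generated entry point thus has a signature along the lines of
//     void __Vthread_1(void* voidSelf, bool even_cycle);
// (name illustrative, matching the argTypes() set above), and finishes by
// signaling __Vm_mtaskstate_final so the caller can tell when every thread
// has drained its mtask list.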

static void addThreadStartToExecGraph(AstExecGraph* const execGraphp,
                                      const std::vector<AstCFunc*>& funcps) {
    // FileLine used for constructing nodes below
    FileLine* const fl = v3Global.rootp()->fileline();

    // Add thread function invocations to execGraph
    const auto addStrStmt = [=](const string& stmt) -> void {  //
        execGraphp->addStmtsp(new AstCStmt(fl, stmt));
    };
    const auto addTextStmt = [=](const string& text) -> void {
        execGraphp->addStmtsp(new AstText(fl, text, /* tracking: */ true));
    };

    addStrStmt("vlSelf->__Vm_even_cycle = !vlSelf->__Vm_even_cycle;\n");

    const uint32_t last = funcps.size() - 1;
    for (uint32_t i = 0; i <= last; ++i) {
        AstCFunc* const funcp = funcps.at(i);
        if (i != last) {
            // The first N-1 will run on the thread pool.
            addTextStmt("vlSelf->__Vm_threadPoolp->workerp(" + cvtToStr(i) + ")->addTask(");
            execGraphp->addStmtsp(new AstAddrOfCFunc(fl, funcp));
            addTextStmt(", vlSelf, vlSelf->__Vm_even_cycle);\n");
        } else {
            // The last will run on the main thread.
            AstCCall* const callp = new AstCCall(fl, funcp);
            callp->argTypes("vlSelf, vlSelf->__Vm_even_cycle");
            execGraphp->addStmtsp(callp);
            addStrStmt("Verilated::mtaskId(0);\n");
        }
    }

    addStrStmt("vlSelf->__Vm_mtaskstate_final.waitUntilUpstreamDone(vlSelf->__Vm_even_cycle);\n");
}
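
// The resulting dispatch code looks roughly like this sketch for three thread
// functions (illustrative only):
//
//     vlSelf->__Vm_even_cycle = !vlSelf->__Vm_even_cycle;
//     vlSelf->__Vm_threadPoolp->workerp(0)->addTask(
//         &__Vthread_0, vlSelf, vlSelf->__Vm_even_cycle);
//     vlSelf->__Vm_threadPoolp->workerp(1)->addTask(
//         &__Vthread_1, vlSelf, vlSelf->__Vm_even_cycle);
//     __Vthread_2(vlSelf, vlSelf->__Vm_even_cycle);  // The last runs on the main thread
//     Verilated::mtaskId(0);
//     vlSelf->__Vm_mtaskstate_final.waitUntilUpstreamDone(vlSelf->__Vm_even_cycle);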

static void implementExecGraph(AstExecGraph* const execGraphp) {
    // Nothing to be done if there are no MTasks in the graph at all.
    if (execGraphp->depGraphp()->empty()) return;

    // Schedule the mtasks: statically associate each mtask with a thread,
    // and determine the order in which each thread will run its mtasks.
    const ThreadSchedule& schedule = PartPackMTasks().pack(*execGraphp->mutableDepGraphp());

    // Create a function to be run by each thread. Note this moves all AstMTaskBody nodes from
    // the AstExecGraph into the AstCFunc created.
    const std::vector<AstCFunc*>& funcps = createThreadFunctions(schedule);
    UASSERT(!funcps.empty(), "Non-empty ExecGraph yields no threads?");

    // Start the thread functions at the point this AstExecGraph is located in the tree.
    addThreadStartToExecGraph(execGraphp, funcps);
}

void V3Partition::finalize() {
    // Called by Verilator top stage
    AstExecGraph* const execGraphp = v3Global.rootp()->execGraphp();
    UASSERT(execGraphp, "Couldn't find AstExecGraph singleton.");

    // Back in V3Order, we partitioned mtasks using provisional cost
    // estimates. However, V3Order precedes some optimizations (notably
    // V3LifePost) that can change the cost of logic within each mtask.
    // Now that logic is final, recompute the cost and priority of each
    // ExecMTask.
    finalizeCosts(execGraphp->mutableDepGraphp());

    // Replace the graph body with its multi-threaded implementation.
    implementExecGraph(execGraphp);
}

void V3Partition::selfTest() {
    PartPropagateCpSelfTest::selfTest();
    PartPackMTasks::selfTest();
    PartContraction::selfTest();
}