Factor out graph parallelism report into a generic algorithm (#4957)

This is a generic algorithm parametrised by a cost function, so
implement it as such for easy reuse.
This commit is contained in:
Geza Lore 2024-03-10 14:56:43 +00:00 committed by GitHub
parent 1481d34959
commit a686e547cf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 129 additions and 114 deletions

View File

@ -26,6 +26,7 @@
#include "V3ThreadSafety.h"
#include <algorithm>
#include <functional>
class FileLine;
class V3Graph;
@ -182,6 +183,38 @@ public:
void dumpEdges(std::ostream& os, const V3GraphVertex* vertexp) const VL_MT_DISABLED;
static void selfTest() VL_MT_DISABLED;
// Result of the graph parallelism analysis (see parallelismReport() below).
// Instances can only be default-constructed by the friend GraphAlgParallelismReport,
// which fills in the fields; everyone else can copy a report and read it via the accessors.
class ParallelismReport final {
    friend class GraphAlgParallelismReport;

    // MEMBERS
    // Total cost of evaluating the whole graph. The ratio of m_totalGraphCost to
    // m_criticalPathCost gives us an estimate of the parallelizability of this graph which is
    // only as good as the guess returned by vertexCost.
    uint32_t m_totalGraphCost = 0;
    // Cost of the critical path, in abstract units (the same units returned by the vertexCost)
    uint32_t m_criticalPathCost = 0;
    size_t m_vertexCount = 0;  // Number of vertexes in the graph
    size_t m_edgeCount = 0;  // Number of edges in the graph

    // CONSTRUCTORS
    ParallelismReport() = default;

public:
    ~ParallelismReport() = default;
    ParallelismReport(const ParallelismReport&) = default;
    ParallelismReport& operator=(const ParallelismReport&) = default;

    // ACCESSORS
    uint32_t totalGraphCost() const { return m_totalGraphCost; }
    uint32_t criticalPathCost() const { return m_criticalPathCost; }
    size_t vertexCount() const { return m_vertexCount; }
    size_t edgeCount() const { return m_edgeCount; }
    // Ratio of total graph cost to critical path cost.
    // Returns 0.0 when the critical path cost is zero (e.g. a default/empty report),
    // rather than dividing by zero and producing NaN/infinity.
    double parallelismFactor() const {
        if (m_criticalPathCost == 0) return 0.0;
        return static_cast<double>(m_totalGraphCost) / m_criticalPathCost;
    }
};
ParallelismReport parallelismReport(
std::function<uint32_t(const V3GraphVertex*)> vertexCost) const VL_MT_DISABLED;
// CALLBACKS
virtual void loopsMessageCb(V3GraphVertex* vertexp) VL_MT_DISABLED;
virtual void loopsVertexCb(V3GraphVertex* vertexp) VL_MT_DISABLED;

View File

@ -23,6 +23,8 @@
#include "V3Global.h"
#include "V3GraphPathChecker.h"
#include "V3GraphStream.h"
#include "V3Stats.h"
#include <algorithm>
#include <list>
@ -515,3 +517,59 @@ double V3Graph::orderDFSIterate(V3GraphVertex* vertexp) {
vertexp->user(2);
return vertexp->fanout();
}
//######################################################################
//######################################################################
// Algorithms - parallelism report
// Computes a V3Graph::ParallelismReport for a graph, given a caller supplied vertex cost
// function. All of the work is done in the (private) constructor; clients use apply().
class GraphAlgParallelismReport final {
    // TYPES
    using VertexCostFn = std::function<uint32_t(const V3GraphVertex*)>;

    // MEMBERS
    const V3Graph& m_graph;  // The graph
    // Vertex cost function. Borrowed from the argument of apply(), which outlives this
    // object, so no copy of the std::function is made.
    const VertexCostFn& m_vertexCost;
    V3Graph::ParallelismReport m_report;  // The result report

    // CONSTRUCTORS
    explicit GraphAlgParallelismReport(const V3Graph& graph, const VertexCostFn& vertexCost)
        : m_graph{graph}
        , m_vertexCost{vertexCost} {
        // For each node, record the critical path cost from the start
        // of the graph through the end of the node.
        // NOTE(review): this relies on the stream yielding a vertex only after all of its
        // upstream vertices (presumably guaranteed by GraphStreamUnordered); otherwise the
        // lookup below would default-insert a zero cost for the upstream vertex.
        std::unordered_map<const V3GraphVertex*, uint32_t> critPaths;
        GraphStreamUnordered serialize{&m_graph};
        for (const V3GraphVertex* vertexp; (vertexp = serialize.nextp());) {
            ++m_report.m_vertexCount;
            uint32_t cpCostToHere = 0;
            for (V3GraphEdge* edgep = vertexp->inBeginp(); edgep; edgep = edgep->inNextp()) {
                ++m_report.m_edgeCount;
                // Each upstream vertex offers a candidate critical path to the
                // start of this vertex (edges themselves carry no cost here).
                // Whichever candidate is largest is the critical path to reach
                // the start of this node.
                cpCostToHere = std::max(cpCostToHere, critPaths[edgep->fromp()]);
            }
            // Evaluate the cost function exactly once per vertex; the value is
            // needed both for the critical path and for the total.
            const uint32_t cost = m_vertexCost(vertexp);
            // Include the cost of the current vertex in the critical
            // path, so it represents the critical path to the end of
            // this vertex.
            cpCostToHere += cost;
            critPaths[vertexp] = cpCostToHere;
            m_report.m_criticalPathCost = std::max(m_report.m_criticalPathCost, cpCostToHere);
            // Tally the total cost contributed by vertices.
            m_report.m_totalGraphCost += cost;
        }
    }
    ~GraphAlgParallelismReport() = default;
    VL_UNCOPYABLE(GraphAlgParallelismReport);
    VL_UNMOVABLE(GraphAlgParallelismReport);

public:
    // Run the analysis on 'graph', using 'vertexCost' to cost each vertex, and return
    // a copy of the resulting report.
    static V3Graph::ParallelismReport
    apply(const V3Graph& graph, std::function<uint32_t(const V3GraphVertex*)> vertexCost) {
        return GraphAlgParallelismReport(graph, vertexCost).m_report;
    }
};
// Produce the parallelism report of this graph; 'vertexCost' supplies the cost of each
// vertex. Simply forwards to the GraphAlgParallelismReport algorithm class.
V3Graph::ParallelismReport
V3Graph::parallelismReport(std::function<uint32_t(const V3GraphVertex*)> vertexCost) const {
    const V3Graph& thisGraph = *this;
    return GraphAlgParallelismReport::apply(thisGraph, vertexCost);
}

View File

@ -176,6 +176,7 @@ using EdgeHeap = PairingHeap<EdgeKey>;
// LogicMTask
class LogicMTask final : public AbstractLogicMTask {
VL_RTTI_IMPL(LogicMTask, AbstractLogicMTask)
template <GraphWay::en T_Way>
friend class PartPropagateCp;
@ -730,87 +731,6 @@ void MergeCandidate::rescore() {
}
}
//######################################################################
// PartParallelismEst - Estimate parallelism of graph
// Estimates the parallelism available in a graph of mtasks: traverse() walks the graph
// and accumulates the counters below, which can then be read or reported.
class PartParallelismEst final {
    // MEMBERS
    const V3Graph* const m_graphp;  // Mtask-containing graph

    // Total cost of evaluating the whole graph.
    // The ratio of m_totalGraphCost to longestCpCost gives us an estimate
    // of the parallelizability of this graph which is only as good as the
    // guess returned by LogicMTask::cost().
    uint32_t m_totalGraphCost = 0;

    // Cost of the longest critical path, in abstract units (the same units
    // returned by the vertexCost)
    uint32_t m_longestCpCost = 0;

    size_t m_vertexCount = 0;  // Number of vertexes calculated
    size_t m_edgeCount = 0;  // Number of edges calculated

public:
    // CONSTRUCTORS
    explicit PartParallelismEst(const V3Graph* graphp)
        : m_graphp{graphp} {}

    // METHODS
    uint32_t totalGraphCost() const { return m_totalGraphCost; }
    uint32_t longestCritPathCost() const { return m_longestCpCost; }
    size_t vertexCount() const { return m_vertexCount; }
    size_t edgeCount() const { return m_edgeCount; }
    // Ratio of total graph cost to longest critical path cost.
    // Returns 0.0 when the critical path cost is zero (e.g. before traverse() or for
    // an empty graph), rather than dividing by zero and producing NaN/infinity.
    double parallelismFactor() const {
        if (m_longestCpCost == 0) return 0.0;
        return static_cast<double>(m_totalGraphCost) / m_longestCpCost;
    }
    // Walk the graph and accumulate vertex/edge counts, total cost and critical path cost
    void traverse() {
        // For each node, record the critical path cost from the start
        // of the graph through the end of the node.
        std::unordered_map<const V3GraphVertex*, uint32_t> critPaths;
        GraphStreamUnordered serialize{m_graphp};
        for (const V3GraphVertex* vertexp; (vertexp = serialize.nextp());) {
            ++m_vertexCount;
            uint32_t cpCostToHere = 0;
            for (V3GraphEdge* edgep = vertexp->inBeginp(); edgep; edgep = edgep->inNextp()) {
                ++m_edgeCount;
                // Each upstream vertex offers a candidate critical path to the
                // start of this vertex (edges themselves carry no cost here).
                // Whichever candidate is largest is the critical path to reach
                // the start of this node.
                cpCostToHere = std::max(cpCostToHere, critPaths[edgep->fromp()]);
            }
            // Look up the cost of this vertex exactly once; it is needed both
            // for the critical path and for the total.
            const uint32_t cost = vertexCost(vertexp);
            // Include the cost of the current vertex in the critical
            // path, so it represents the critical path to the end of
            // this vertex.
            cpCostToHere += cost;
            critPaths[vertexp] = cpCostToHere;
            m_longestCpCost = std::max(m_longestCpCost, cpCostToHere);
            // Tally the total cost contributed by vertices.
            m_totalGraphCost += cost;
        }
    }
    // Record the accumulated figures as statistics, labelled with 'stage'
    void statsReport(const string& stage) const {
        V3Stats::addStat("MTask graph, " + stage + ", critical path cost", m_longestCpCost);
        V3Stats::addStat("MTask graph, " + stage + ", total graph cost", m_totalGraphCost);
        V3Stats::addStat("MTask graph, " + stage + ", mtask count", m_vertexCount);
        V3Stats::addStat("MTask graph, " + stage + ", edge count", m_edgeCount);
        V3Stats::addStat("MTask graph, " + stage + ", parallelism factor", parallelismFactor());
    }
    // Print the accumulated figures via UINFO
    void debugReport() const {
        UINFO(0, " Critical path cost = " << m_longestCpCost << endl);
        UINFO(0, " Total graph cost = " << m_totalGraphCost << endl);
        UINFO(0, " MTask vertex count = " << m_vertexCount << endl);
        UINFO(0, " Edge count = " << m_edgeCount << endl);
        UINFO(0, " Parallelism factor = " << parallelismFactor() << endl);
    }
    // Cost of a single vertex, which is treated as an AbstractMTask
    static uint32_t vertexCost(const V3GraphVertex* vertexp) {
        return vertexp->as<const AbstractMTask>()->cost();
    }

private:
    VL_UNCOPYABLE(PartParallelismEst);
};
//######################################################################
// Look at vertex costs (in one way) to form critical paths for each
@ -1758,20 +1678,13 @@ private:
chain_len * 2, nullptr, nullptr, false /* slowAsserts */};
ec.go();
PartParallelismEst check{&mtasks};
check.traverse();
// All vertices should merge into one
UASSERT_SELFTEST(
bool, mtasks.verticesBeginp() && !mtasks.verticesBeginp()->verticesNextp(), true);
const uint64_t endUsecs = V3Os::timeUsecs();
const uint64_t elapsedUsecs = endUsecs - startUsecs;
if (debug() >= 6) {
UINFO(0, "Chain self test stats:\n");
check.debugReport();
UINFO(0, "Elapsed usecs = " << elapsedUsecs << "\n");
}
// All vertices should merge into one
UASSERT_SELFTEST(size_t, check.vertexCount(), 1);
return elapsedUsecs;
}
@ -1811,20 +1724,15 @@ private:
partInitCriticalPaths(&mtasks);
PartContraction{&mtasks, 20, nullptr, nullptr, true}.go();
PartParallelismEst check{&mtasks};
check.traverse();
const auto report = mtasks.parallelismReport(
[](const V3GraphVertex* vtxp) { return vtxp->as<const LogicMTask>()->cost(); });
// Checking exact values here is maybe overly precise. What we're
// mostly looking for is a healthy reduction in the number of
// mtasks.
if (debug() >= 5) {
UINFO(0, "X self test stats:\n");
check.debugReport();
}
UASSERT_SELFTEST(uint32_t, check.longestCritPathCost(), 19);
UASSERT_SELFTEST(uint32_t, check.totalGraphCost(), 101);
UASSERT_SELFTEST(uint32_t, check.vertexCount(), 14);
UASSERT_SELFTEST(uint32_t, check.edgeCount(), 13);
// mostly looking for is a healthy reduction in the number of mtasks.
UASSERT_SELFTEST(uint32_t, report.criticalPathCost(), 19);
UASSERT_SELFTEST(uint32_t, report.totalGraphCost(), 101);
UASSERT_SELFTEST(uint32_t, report.vertexCount(), 14);
UASSERT_SELFTEST(uint32_t, report.edgeCount(), 13);
}
public:
@ -2581,13 +2489,21 @@ void V3Partition::debugMTaskGraphStats(const V3Graph* graphp, const string& stag
// Look only at the cost of each mtask, neglect communication cost.
// This will show us how much parallelism we expect, assuming cache-miss
// costs are minor and the cost of running logic is the dominant cost.
PartParallelismEst vertexParEst{graphp};
vertexParEst.traverse();
vertexParEst.statsReport(stage);
const auto report = graphp->parallelismReport(
[](const V3GraphVertex* vtxp) { return vtxp->as<const AbstractMTask>()->cost(); });
V3Stats::addStat("MTask graph, " + stage + ", critical path cost", report.criticalPathCost());
V3Stats::addStat("MTask graph, " + stage + ", total graph cost", report.totalGraphCost());
V3Stats::addStat("MTask graph, " + stage + ", mtask count", report.vertexCount());
V3Stats::addStat("MTask graph, " + stage + ", edge count", report.edgeCount());
V3Stats::addStat("MTask graph, " + stage + ", parallelism factor", report.parallelismFactor());
if (debug() >= 4) {
UINFO(0, "\n");
UINFO(0, " Parallelism estimate for based on mtask costs:\n");
vertexParEst.debugReport();
UINFO(0, " MTask Parallelism estimate based costs at stage" << stage << ":\n");
UINFO(0, " Critical path cost = " << report.criticalPathCost() << "\n");
UINFO(0, " Total graph cost = " << report.totalGraphCost() << "\n");
UINFO(0, " MTask vertex count = " << report.vertexCount() << "\n");
UINFO(0, " Edge count = " << report.edgeCount() << "\n");
UINFO(0, " Parallelism factor = " << report.parallelismFactor() << "\n");
}
}
@ -3041,13 +2957,21 @@ static void finalizeCosts(V3Graph* execMTaskGraphp) {
}
// Record summary stats for final m_tasks graph.
// (More verbose stats are available with --debugi-V3Partition >= 3.)
PartParallelismEst parEst{execMTaskGraphp};
parEst.traverse();
parEst.statsReport("final");
const auto report = execMTaskGraphp->parallelismReport(
[](const V3GraphVertex* vtxp) { return vtxp->as<const ExecMTask>()->cost(); });
V3Stats::addStat("MTask graph, final, critical path cost", report.criticalPathCost());
V3Stats::addStat("MTask graph, final, total graph cost", report.totalGraphCost());
V3Stats::addStat("MTask graph, final, mtask count", report.vertexCount());
V3Stats::addStat("MTask graph, final, edge count", report.edgeCount());
V3Stats::addStat("MTask graph, final, parallelism factor", report.parallelismFactor());
if (debug() >= 3) {
UINFO(0, " Final mtask parallelism report:\n");
parEst.debugReport();
UINFO(0, "\n");
UINFO(0, " Final mtask parallelism report:\n");
UINFO(0, " Critical path cost = " << report.criticalPathCost() << "\n");
UINFO(0, " Total graph cost = " << report.totalGraphCost() << "\n");
UINFO(0, " MTask vertex count = " << report.vertexCount() << "\n");
UINFO(0, " Edge count = " << report.edgeCount() << "\n");
UINFO(0, " Parallelism factor = " << report.parallelismFactor() << "\n");
}
}