V3Partition: Reduce working set size of PartContraction (#3587)

This yields an additional 25% speedup of MT scheduling.
This commit is contained in:
Geza Lore 2022-09-01 17:29:40 +02:00 committed by GitHub
parent 849bb5590a
commit 875361d7ce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 95 additions and 40 deletions

View File

@ -1334,37 +1334,38 @@ void OrderProcess::processMTasks() {
const V3GraphVertex* moveVxp;
while ((moveVxp = emit_logic.nextp())) {
const MTaskMoveVertex* const movep = dynamic_cast<const MTaskMoveVertex*>(moveVxp);
// Only care about logic vertices
if (!movep->logicp()) continue;
const unsigned mtaskId = movep->color();
UASSERT(mtaskId > 0, "Every MTaskMoveVertex should have an mtask assignment >0");
if (movep->logicp()) {
// Add this logic to the per-mtask order
mtaskStates[mtaskId].m_logics.push_back(movep->logicp());
// Since we happen to be iterating over every logic node,
// take this opportunity to annotate each AstVar with the id's
// of mtasks that consume it and produce it. We'll use this
// information in V3EmitC when we lay out var's in memory.
const OrderLogicVertex* const logicp = movep->logicp();
for (const V3GraphEdge* edgep = logicp->inBeginp(); edgep; edgep = edgep->inNextp()) {
const OrderVarVertex* const pre_varp
= dynamic_cast<const OrderVarVertex*>(edgep->fromp());
if (!pre_varp) continue;
AstVar* const varp = pre_varp->vscp()->varp();
// varp depends on logicp, so logicp produces varp,
// and vice-versa below
varp->addProducingMTaskId(mtaskId);
}
for (const V3GraphEdge* edgep = logicp->outBeginp(); edgep;
edgep = edgep->outNextp()) {
const OrderVarVertex* const post_varp
= dynamic_cast<const OrderVarVertex*>(edgep->top());
if (!post_varp) continue;
AstVar* const varp = post_varp->vscp()->varp();
varp->addConsumingMTaskId(mtaskId);
}
// TODO? We ignore IO vars here, so those will have empty mtask
// signatures. But we could also give those mtask signatures.
// Add this logic to the per-mtask order
mtaskStates[mtaskId].m_logics.push_back(movep->logicp());
// Since we happen to be iterating over every logic node,
// take this opportunity to annotate each AstVar with the id's
// of mtasks that consume it and produce it. We'll use this
// information in V3EmitC when we lay out var's in memory.
const OrderLogicVertex* const logicp = movep->logicp();
for (const V3GraphEdge* edgep = logicp->inBeginp(); edgep; edgep = edgep->inNextp()) {
const OrderVarVertex* const pre_varp
= dynamic_cast<const OrderVarVertex*>(edgep->fromp());
if (!pre_varp) continue;
AstVar* const varp = pre_varp->vscp()->varp();
// varp depends on logicp, so logicp produces varp,
// and vice-versa below
varp->addProducingMTaskId(mtaskId);
}
for (const V3GraphEdge* edgep = logicp->outBeginp(); edgep; edgep = edgep->outNextp()) {
const OrderVarVertex* const post_varp
= dynamic_cast<const OrderVarVertex*>(edgep->top());
if (!post_varp) continue;
AstVar* const varp = post_varp->vscp()->varp();
varp->addConsumingMTaskId(mtaskId);
}
// TODO? We ignore IO vars here, so those will have empty mtask
// signatures. But we could also give those mtask signatures.
}
// Create the AstExecGraph node which represents the execution

View File

@ -2611,6 +2611,42 @@ void V3Partition::hashGraphDebug(const V3Graph* graphp, const char* debugName) {
UINFO(0, "Hash of shape (not contents) of " << debugName << " = " << cvtToStr(hash) << endl);
}
// Decides whether a MTaskMoveVertex can be bypassed (i.e. not given a LogicMTask of its own)
// when the MTask graph is built. The fine-grained dependency graph of MTaskMoveVertex vertices
// is bipartite, its two kinds of vertices being:
// - 1. MTaskMoveVertex instances holding logic via an OrderLogicVertex
//      (MTaskMoveVertex::logicp() != nullptr)
// - 2. MTaskMoveVertex instances holding an (OrderVarVertex, domain) pair
// Only the logic vertices need to be ordered. The variable/domain vertices merely carry
// dependencies and are discarded eventually. To shrink the working set of PartContraction, no
// LogicMTask is created for a variable vertex when its transitive dependencies can be added
// directly without needing more dependency edges than keeping the intermediate vertex would.
// In other words, a variable vertex is bypassed when fanIn * fanOut <= fanIn + fanOut, which
// holds only if fanIn or fanOut is at most 1, or if both are exactly 2. This can reduce the
// working set size significantly.
//
// Returns true if 'mvtxp' should be bypassed, false if it must get its own LogicMTask.
static bool bypassOk(MTaskMoveVertex* mvtxp) {
    // Logic vertices are the ones being ordered, so they are never bypassed
    if (mvtxp->logicp()) return false;

    // Tally incoming edges, but stop at 3: a larger value cannot change the decision
    unsigned inCount = 0;
    V3GraphEdge* inEdgep = mvtxp->inBeginp();
    while (inEdgep) {
        if (++inCount == 3) break;
        inEdgep = inEdgep->inNextp();
    }
    UDEBUGONLY(UASSERT_OBJ(inCount <= 3, mvtxp, "Should have stopped counting fanIn"););
    // With a fan-in of 0 or 1 the inequality holds whatever the fan-out is, so bypass
    if (inCount < 2) return true;

    // Tally outgoing edges the same way, again stopping at 3
    unsigned outCount = 0;
    V3GraphEdge* outEdgep = mvtxp->outBeginp();
    while (outEdgep) {
        if (++outCount == 3) break;
        outEdgep = outEdgep->outNextp();
    }
    UDEBUGONLY(UASSERT_OBJ(outCount <= 3, mvtxp, "Should have stopped counting fanOut"););
    // Likewise, a fan-out of 0 or 1 always satisfies the inequality, so bypass
    if (outCount < 2) return true;

    // Both tallies are now either 2 or 3: (2, 2), (2, 3), (3, 2) or (3, 3).
    // Of these, only (2, 2) satisfies the inequality.
    return inCount == 2 && outCount == 2;
}
uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) {
uint32_t totalGraphCost = 0;
@ -2627,9 +2663,13 @@ uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) {
for (V3GraphVertex *vtxp = m_fineDepsGraphp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
nextp = vtxp->verticesNextp();
MTaskMoveVertex* const mVtxp = static_cast<MTaskMoveVertex*>(vtxp);
LogicMTask* const mtaskp = new LogicMTask{mtasksp, mVtxp};
mVtxp->userp(mtaskp);
totalGraphCost += mtaskp->cost();
if (bypassOk(mVtxp)) {
mVtxp->userp(nullptr); // Set to nullptr to mark as bypassed
} else {
LogicMTask* const mtaskp = new LogicMTask{mtasksp, mVtxp};
mVtxp->userp(mtaskp);
totalGraphCost += mtaskp->cost();
}
}
// Artificial single exit point vertex in the MTask graph to allow sibling merges.
@ -2647,20 +2687,34 @@ uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) {
// At this point, there should only be one MTaskMoveVertex per LogicMTask
UASSERT_OBJ(mtaskp->vertexListp()->size() == 1, mtaskp, "Multiple MTaskMoveVertex");
MTaskMoveVertex* const mvtxp = mtaskp->vertexListp()->front();
for (V3GraphEdge* outp = mvtxp->outBeginp(); outp; outp = outp->outNextp()) {
UASSERT(outp->weight() > 0, "Dependency with 0 weight in Move graph");
UASSERT_OBJ(mvtxp->userp(), mtaskp, "Bypassed MTaskMoveVertex should not have MTask");
// Grab the opposite end MTask.
LogicMTask* const otherp = static_cast<LogicMTask*>(outp->top()->userp());
// Function to add an edge to a dependent from 'mtaskp'
const auto addEdge = [mtasksp, mtaskp](LogicMTask* otherp) {
UASSERT_OBJ(otherp != mtaskp, mtaskp, "Would create a cycle edge");
// Don't create redundant edges.
if (mtaskp->hasRelativeMTask(otherp)) continue;
// Add the MTask->MTask dependency edge
if (mtaskp->hasRelativeMTask(otherp)) return; // Don't create redundant edges.
new MTaskEdge{mtasksp, mtaskp, otherp, 1};
};
// Iterate downstream direct dependents
for (V3GraphEdge *dEdgep = mvtxp->outBeginp(), *dNextp; dEdgep; dEdgep = dNextp) {
dNextp = dEdgep->outNextp();
V3GraphVertex* const top = dEdgep->top();
if (LogicMTask* const otherp = static_cast<LogicMTask*>(top->userp())) {
// The opposite end of the edge is not a bypassed vertex, add as direct dependent
addEdge(otherp);
} else {
// The opposite end of the edge is a bypassed vertex, add transitive dependents
for (V3GraphEdge *tEdgep = top->outBeginp(), *tNextp; tEdgep; tEdgep = tNextp) {
tNextp = tEdgep->outNextp();
LogicMTask* const transp = static_cast<LogicMTask*>(tEdgep->top()->userp());
// The Move graph is bipartite (logic <-> var), and logic is never bypassed,
// hence 'transp' must be non nullptr.
UASSERT_OBJ(transp, mvtxp, "This cannot be a bypassed vertex");
addEdge(transp);
}
}
}
}