diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0dd9d1991..146dd611a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -79,6 +79,7 @@ set(HEADERS
     V3EmitV.h
     V3EmitXml.h
     V3Error.h
+    V3ExecGraph.h
     V3Expand.h
     V3File.h
     V3FileLine.h
@@ -130,7 +131,6 @@ set(HEADERS
     V3Parse.h
     V3ParseImp.h
     V3ParseSym.h
-    V3Partition.h
     V3PartitionGraph.h
     V3PchAstMT.h
     V3PchAstNoMT.h
@@ -240,6 +240,7 @@ set(COMMON_SOURCES
     V3EmitV.cpp
     V3EmitXml.cpp
     V3Error.cpp
+    V3ExecGraph.cpp
     V3Expand.cpp
     V3File.cpp
     V3FileLine.cpp
@@ -282,7 +283,6 @@ set(COMMON_SOURCES
     V3OrderSerial.cpp
     V3Os.cpp
     V3Param.cpp
-    V3Partition.cpp
     V3PreShell.cpp
     V3Premit.cpp
     V3ProtectLib.cpp
diff --git a/src/Makefile_obj.in b/src/Makefile_obj.in
index 8df1de31f..fb9306a4e 100644
--- a/src/Makefile_obj.in
+++ b/src/Makefile_obj.in
@@ -242,6 +242,7 @@ RAW_OBJS_PCH_ASTNOMT = \
 	V3EmitCSyms.o \
 	V3EmitMk.o \
 	V3EmitXml.o \
+	V3ExecGraph.o \
 	V3Expand.o \
 	V3Force.o \
 	V3Fork.o \
@@ -270,7 +271,6 @@ RAW_OBJS_PCH_ASTNOMT = \
 	V3OrderProcessDomains.o \
 	V3OrderSerial.o \
 	V3Param.o \
-	V3Partition.o \
 	V3Premit.o \
 	V3ProtectLib.o \
 	V3Randomize.o \
diff --git a/src/V3ExecGraph.cpp b/src/V3ExecGraph.cpp
new file mode 100644
index 000000000..74fa39769
--- /dev/null
+++ b/src/V3ExecGraph.cpp
@@ -0,0 +1,850 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//*************************************************************************
+// DESCRIPTION: Verilator: AstExecGraph code construction
+//
+// Code available from: https://verilator.org
+//
+//*************************************************************************
+//
+// Copyright 2003-2024 by Wilson Snyder. This program is free software; you
+// can redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+//
+//*************************************************************************
+
+#include "V3PchAstNoMT.h"  // VL_MT_DISABLED_CODE_UNIT
+
+#include "V3ExecGraph.h"
+
+#include "V3Config.h"
+#include "V3EmitCBase.h"
+#include "V3File.h"
+#include "V3GraphStream.h"
+#include "V3InstrCount.h"
+#include "V3Os.h"
+#include "V3PartitionGraph.h"
+#include "V3Stats.h"
+#include "V3UniqueNames.h"
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+VL_DEFINE_DEBUG_FUNCTIONS;
+
+namespace V3ExecGraph {
+
+//######################################################################
+// ThreadSchedule
+
+// The thread schedule, containing all information needed later. Note that this is a simple
+// aggregate data type and the only way to get hold of an instance of it is via
+// PackThreads::pack, which is moved from there and is const, which means we can only acquire a
+// const reference to it, so no further modifications are allowed, so all members are public
+// (attributes).
+class ThreadSchedule final {
+    friend class PackThreads;
+
+public:
+    // CONSTANTS
+    static constexpr uint32_t UNASSIGNED = 0xffffffff;
+
+    // TYPES
+    struct MTaskState final {
+        uint32_t completionTime = 0;  // Estimated time this mtask will complete
+        uint32_t threadId = UNASSIGNED;  // Thread id this MTask is assigned to
+        const ExecMTask* nextp = nullptr;  // Next MTask on same thread after this
+    };
+
+    // MEMBERS
+    // Allocation of sequence of MTasks to threads. Can be considered a map from thread ID to
+    // the sequence of MTasks to be executed by that thread.
+    std::vector<std::vector<const ExecMTask*>> threads;
+
+    // State for each mtask.
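+    // Illustrative shape only (a hedged example, not output of the real packer): with
+    // 2 threads, 'threads' above might be {{t0, t1}, {t2}}, i.e. thread 0 runs t0 then
+    // t1 while thread 1 runs t2. The map below would then record for t1 a threadId of
+    // 0, for t0 a nextp pointing at t1, and each task's estimated completionTime.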
+    std::unordered_map<const ExecMTask*, MTaskState> mtaskState;
+
+    uint32_t threadId(const ExecMTask* mtaskp) const {
+        const auto& it = mtaskState.find(mtaskp);
+        return it != mtaskState.end() ? it->second.threadId : UNASSIGNED;
+    }
+
+private:
+    explicit ThreadSchedule(uint32_t nThreads)
+        : threads{nThreads} {}
+    VL_UNCOPYABLE(ThreadSchedule);  // But movable
+    ThreadSchedule(ThreadSchedule&&) = default;
+    ThreadSchedule& operator=(ThreadSchedule&&) = default;
+
+    // Debugging
+    void dumpDotFile(const V3Graph& graph, const string& filename) const {
+        // This generates a file used by graphviz, https://www.graphviz.org
+        const std::unique_ptr<std::ofstream> logp{V3File::new_ofstream(filename)};
+        if (logp->fail()) v3fatal("Can't write " << filename);
+
+        // Header
+        *logp << "digraph v3graph {\n";
+        *logp << "  graph[layout=\"neato\" labelloc=t labeljust=l label=\"" << filename
+              << "\"]\n";
+        *logp << "  node[shape=\"rect\" ratio=\"fill\" fixedsize=true]\n";
+
+        // Thread labels
+        *logp << "\n  // Threads\n";
+        const int threadBoxWidth = 2;
+        for (int i = 0; i < v3Global.opt.threads(); i++) {
+            *logp << "  t" << i << " [label=\"Thread " << i << "\" width=" << threadBoxWidth
+                  << " pos=\"" << (-threadBoxWidth / 2) << "," << -i
+                  << "!\" style=\"filled\" fillcolor=\"grey\"] \n";
+        }
+
+        // MTask nodes
+        *logp << "\n  // MTasks\n";
+
+        // Find minimum cost MTask for scaling MTask node widths
+        uint32_t minCost = UINT32_MAX;
+        for (const V3GraphVertex* vxp = graph.verticesBeginp(); vxp;
+             vxp = vxp->verticesNextp()) {
+            if (const ExecMTask* const mtaskp = vxp->cast<const ExecMTask>()) {
+                minCost = minCost > mtaskp->cost() ? mtaskp->cost() : minCost;
+            }
+        }
+        const double minWidth = 2.0;
+        const auto mtaskXPos = [&](const ExecMTask* mtaskp, const double nodeWidth) {
+            const double startPosX = (minWidth * startTime(mtaskp)) / minCost;
+            return nodeWidth / minWidth + startPosX;
+        };
+
+        const auto emitMTask = [&](const ExecMTask* mtaskp) {
+            const int thread = threadId(mtaskp);
+            const double nodeWidth = minWidth * (static_cast<double>(mtaskp->cost()) / minCost);
+            const double x = mtaskXPos(mtaskp, nodeWidth);
+            const int y = -thread;
+            const string label = "label=\"" + mtaskp->name() + " (" + cvtToStr(startTime(mtaskp))
+                                 + ":" + std::to_string(endTime(mtaskp)) + ")" + "\"";
+            *logp << "  " << mtaskp->name() << " [" << label << " width=" << nodeWidth
+                  << " pos=\"" << x << "," << y << "!\"]\n";
+        };
+
+        // Emit MTasks
+        for (const V3GraphVertex* vxp = graph.verticesBeginp(); vxp;
+             vxp = vxp->verticesNextp()) {
+            if (const ExecMTask* const mtaskp = vxp->cast<const ExecMTask>()) emitMTask(mtaskp);
+        }
+
+        // Emit MTask dependency edges
+        *logp << "\n  // MTask dependencies\n";
+        for (const V3GraphVertex* vxp = graph.verticesBeginp(); vxp;
+             vxp = vxp->verticesNextp()) {
+            if (const ExecMTask* const mtaskp = vxp->cast<const ExecMTask>()) {
+                for (V3GraphEdge* edgep = mtaskp->outBeginp(); edgep; edgep = edgep->outNextp()) {
+                    const V3GraphVertex* const top = edgep->top();
+                    *logp << "  " << vxp->name() << " -> " << top->name() << "\n";
+                }
+            }
+        }
+
+        // Trailer
+        *logp << "}\n";
+        logp->close();
+    }
+
+    // Variant of dumpDotFilePrefixed without --dump option check
+    void dumpDotFilePrefixedAlways(const V3Graph& graph, const string& nameComment) const {
+        dumpDotFile(graph, v3Global.debugFilename(nameComment) + ".dot");
+    }
+
+public:
+    // Returns the number of cross-thread dependencies of the given MTask. If > 0, the MTask must
+    // test whether its dependencies are ready before starting, and therefore may need to block.
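+    // Example (illustrative numbers only): if this MTask runs on thread 0 and two of
+    // its upstream dependencies were packed onto thread 1, this returns 2, and the
+    // generated code waits on an __Vm_mtaskstate_<id> counter initialized to 2.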
+    uint32_t crossThreadDependencies(const ExecMTask* mtaskp) const {
+        const uint32_t thisThreadId = threadId(mtaskp);
+        uint32_t result = 0;
+        for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep; edgep = edgep->inNextp()) {
+            const ExecMTask* const prevp = edgep->fromp()->as<ExecMTask>();
+            if (threadId(prevp) != thisThreadId) ++result;
+        }
+        return result;
+    }
+
+    uint32_t startTime(const ExecMTask* mtaskp) const {
+        return mtaskState.at(mtaskp).completionTime - mtaskp->cost();
+    }
+    uint32_t endTime(const ExecMTask* mtaskp) const {
+        return mtaskState.at(mtaskp).completionTime;
+    }
+};
+
+//######################################################################
+// PackThreads
+
+// Statically pack tasks into threads.
+//
+// The simplest thing that could possibly work would be to assume that our
+// predictions of task runtimes are precise, and that every thread will
+// make progress at an equal rate. Simulate a single "clock", pack the
+// highest priority ready task into whatever thread becomes ready earliest,
+// repeating until no tasks remain.
+//
+// That doesn't work well, as our predictions of task runtimes have wide
+// error bars (+/- 60% is typical.)
+//
+// So be a little more clever: let each task have a different end time,
+// depending on which thread is looking. Be a little bit pessimistic when
+// thread A checks the end time of an mtask running on thread B. This extra
+// "padding" avoids tight "layovers" at cross-thread dependencies.
+class PackThreads final {
+    // TYPES
+    struct MTaskCmp final {
+        bool operator()(const ExecMTask* ap, const ExecMTask* bp) const {
+            return ap->id() < bp->id();
+        }
+    };
+
+    // MEMBERS
+    const uint32_t m_nThreads;  // Number of threads
+    const uint32_t m_sandbagNumerator;  // Numerator padding for est runtime
+    const uint32_t m_sandbagDenom;  // Denominator padding for est runtime
+
+public:
+    // CONSTRUCTORS
+    explicit PackThreads(uint32_t nThreads = v3Global.opt.threads(),
+                         unsigned sandbagNumerator = 30, unsigned sandbagDenom = 100)
+        : m_nThreads{nThreads}
+        , m_sandbagNumerator{sandbagNumerator}
+        , m_sandbagDenom{sandbagDenom} {}
+    ~PackThreads() = default;
+
+private:
+    // METHODS
+    uint32_t completionTime(const ThreadSchedule& schedule, const ExecMTask* mtaskp,
+                            uint32_t threadId) {
+        const ThreadSchedule::MTaskState& state = schedule.mtaskState.at(mtaskp);
+        UASSERT(state.threadId != ThreadSchedule::UNASSIGNED,
+                "Mtask should have assigned thread");
+        if (threadId == state.threadId) {
+            // No overhead on same thread
+            return state.completionTime;
+        }
+
+        // Add some padding to the estimated runtime when looking from
+        // another thread
+        uint32_t sandbaggedEndTime
+            = state.completionTime + (m_sandbagNumerator * mtaskp->cost()) / m_sandbagDenom;
+
+        // If task B is packed after task A on thread 0, don't let thread 1
+        // think that A finishes earlier than thread 0 thinks that B
+        // finishes, otherwise we get priority inversions and fail the self
+        // test.
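+        // Worked example (numbers match the selfTest below): task A ends at 1000 on
+        // thread 0 with cost 1000 and a 3/10 sandbag, so thread 1 would see
+        // 1000 + 300 = 1300; but if B is packed after A and reported to end at 1100,
+        // A's reported end time is clamped to 1099 so B still appears to finish last.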
+        if (state.nextp) {
+            const uint32_t successorEndTime
+                = completionTime(schedule, state.nextp, state.threadId);
+            if ((sandbaggedEndTime >= successorEndTime) && (successorEndTime > 1)) {
+                sandbaggedEndTime = successorEndTime - 1;
+            }
+        }
+
+        UINFO(6, "Sandbagged end time for " << mtaskp->name() << " on th " << threadId << " = "
+                                            << sandbaggedEndTime << endl);
+        return sandbaggedEndTime;
+    }
+
+    bool isReady(ThreadSchedule& schedule, const ExecMTask* mtaskp) {
+        for (V3GraphEdge* edgeInp = mtaskp->inBeginp(); edgeInp; edgeInp = edgeInp->inNextp()) {
+            const ExecMTask* const prevp = edgeInp->fromp()->as<ExecMTask>();
+            if (schedule.threadId(prevp) == ThreadSchedule::UNASSIGNED) {
+                // This predecessor is not assigned yet
+                return false;
+            }
+        }
+        return true;
+    }
+
+public:
+    // Pack the MTasks from the given graph into m_nThreads threads, and return the schedule.
+    const ThreadSchedule pack(const V3Graph& mtaskGraph) {
+        // The result
+        ThreadSchedule schedule{m_nThreads};
+
+        // Time each thread is occupied until
+        std::vector<uint32_t> busyUntil(m_nThreads, 0);
+
+        // MTasks ready to be assigned next. All their dependencies are already assigned.
+        std::set<ExecMTask*, MTaskCmp> readyMTasks;
+
+        // Build initial ready list
+        for (V3GraphVertex* vxp = mtaskGraph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
+            ExecMTask* const mtaskp = vxp->as<ExecMTask>();
+            if (isReady(schedule, mtaskp)) readyMTasks.insert(mtaskp);
+        }
+
+        while (!readyMTasks.empty()) {
+            // For each task in the ready set, compute when it might start
+            // on each thread (in that thread's local time frame.)
+            uint32_t bestTime = 0xffffffff;
+            uint32_t bestThreadId = 0;
+            ExecMTask* bestMtaskp = nullptr;  // Todo: const ExecMTask*
+            for (uint32_t threadId = 0; threadId < m_nThreads; ++threadId) {
+                for (ExecMTask* const mtaskp : readyMTasks) {
+                    uint32_t timeBegin = busyUntil[threadId];
+                    if (timeBegin > bestTime) {
+                        UINFO(6, "th " << threadId << " busy until " << timeBegin
+                                       << ", later than bestTime " << bestTime
+                                       << ", skipping thread.\n");
+                        break;
+                    }
+                    for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep;
+                         edgep = edgep->inNextp()) {
+                        const ExecMTask* const priorp = edgep->fromp()->as<ExecMTask>();
+                        const uint32_t priorEndTime = completionTime(schedule, priorp, threadId);
+                        if (priorEndTime > timeBegin) timeBegin = priorEndTime;
+                    }
+                    UINFO(6, "Task " << mtaskp->name() << " start at " << timeBegin
+                                     << " on thread " << threadId << endl);
+                    if ((timeBegin < bestTime)
+                        || ((timeBegin == bestTime)
+                            && bestMtaskp  // Redundant, but appeases static analysis tools
+                            && (mtaskp->priority() > bestMtaskp->priority()))) {
+                        bestTime = timeBegin;
+                        bestThreadId = threadId;
+                        bestMtaskp = mtaskp;
+                    }
+                }
+            }
+
+            UASSERT(bestMtaskp, "Should have found some task");
+            UINFO(6, "Will schedule " << bestMtaskp->name() << " onto thread " << bestThreadId
+                                      << endl);
+
+            // Reference to thread in schedule we are assigning this MTask to.
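+            // (This greedy pick is classic list scheduling: among all ready tasks and
+            // threads, take the earliest possible start time, breaking ties toward the
+            // higher-priority task, i.e. the one heading the longer downstream critical
+            // path.)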
+            std::vector<const ExecMTask*>& bestThread = schedule.threads[bestThreadId];
+
+            // Update algorithm state
+            bestMtaskp->predictStart(bestTime);  // Only for gantt reporting
+            const uint32_t bestEndTime = bestTime + bestMtaskp->cost();
+            schedule.mtaskState[bestMtaskp].completionTime = bestEndTime;
+            schedule.mtaskState[bestMtaskp].threadId = bestThreadId;
+            if (!bestThread.empty()) schedule.mtaskState[bestThread.back()].nextp = bestMtaskp;
+            busyUntil[bestThreadId] = bestEndTime;
+
+            // Add the MTask to the schedule
+            bestThread.push_back(bestMtaskp);
+
+            // Update the ready list
+            const size_t erased = readyMTasks.erase(bestMtaskp);
+            UASSERT_OBJ(erased > 0, bestMtaskp, "Should have erased something?");
+            for (V3GraphEdge* edgeOutp = bestMtaskp->outBeginp(); edgeOutp;
+                 edgeOutp = edgeOutp->outNextp()) {
+                ExecMTask* const nextp = edgeOutp->top()->as<ExecMTask>();
+                // Dependent MTask should not yet be assigned to a thread
+                UASSERT(schedule.threadId(nextp) == ThreadSchedule::UNASSIGNED,
+                        "Tasks after one being assigned should not be assigned yet");
+                // Dependent MTask should not be ready yet, since dependency is just being
+                // assigned
+                UASSERT_OBJ(readyMTasks.find(nextp) == readyMTasks.end(), nextp,
+                            "Tasks after one being assigned should not be ready");
+                if (isReady(schedule, nextp)) {
+                    readyMTasks.insert(nextp);
+                    UINFO(6, "Inserted " << nextp->name() << " into ready\n");
+                }
+            }
+        }
+
+        if (dumpGraphLevel() >= 4) schedule.dumpDotFilePrefixedAlways(mtaskGraph, "schedule");
+
+        return schedule;
+    }
+
+    // SELF TEST
+    static void selfTest() {
+        V3Graph graph;
+        ExecMTask* const t0 = new ExecMTask{&graph, nullptr, 0};
+        t0->cost(1000);
+        t0->priority(1100);
+        ExecMTask* const t1 = new ExecMTask{&graph, nullptr, 1};
+        t1->cost(100);
+        t1->priority(100);
+        ExecMTask* const t2 = new ExecMTask{&graph, nullptr, 2};
+        t2->cost(100);
+        t2->priority(100);
+
+        new V3GraphEdge{&graph, t0, t1, 1};
+        new V3GraphEdge{&graph, t0, t2, 1};
+
+        PackThreads packer{2,  // Threads
+                           3,  // Sandbag numerator
+                           10};  // Sandbag denom
+        const ThreadSchedule& schedule = packer.pack(graph);
+
+        UASSERT_SELFTEST(size_t, schedule.threads.size(), 2);
+
+        UASSERT_SELFTEST(size_t, schedule.threads[0].size(), 2);
+        UASSERT_SELFTEST(size_t, schedule.threads[1].size(), 1);
+
+        UASSERT_SELFTEST(const ExecMTask*, schedule.threads[0][0], t0);
+        UASSERT_SELFTEST(const ExecMTask*, schedule.threads[0][1], t1);
+        UASSERT_SELFTEST(const ExecMTask*, schedule.threads[1][0], t2);
+
+        UASSERT_SELFTEST(size_t, schedule.mtaskState.size(), 3);
+
+        UASSERT_SELFTEST(uint32_t, schedule.threadId(t0), 0);
+        UASSERT_SELFTEST(uint32_t, schedule.threadId(t1), 0);
+        UASSERT_SELFTEST(uint32_t, schedule.threadId(t2), 1);
+
+        // On its native thread, we see the actual end time for t0:
+        UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t0, 0), 1000);
+        // On the other thread, we see a sandbagged end time which does not
+        // exceed the t1 end time:
+        UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t0, 1), 1099);
+
+        // Actual end time on native thread:
+        UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t1, 0), 1100);
+        // Sandbagged end time seen on thread 1. Note it does not compound
+        // with t0's sandbagged time; compounding caused trouble in
+        // practice.
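+        // The arithmetic, from the packing above: t1 natively ends at 1100, so thread 1
+        // sees 1100 + 100*3/10 = 1130. Likewise t2 natively ends at 1099 + 100 = 1199 on
+        // thread 1 (it starts at t0's sandbagged end time), and thread 0 sees
+        // 1199 + 30 = 1229.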
+        UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t1, 1), 1130);
+        UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t2, 0), 1229);
+        UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t2, 1), 1199);
+    }
+
+private:
+    VL_UNCOPYABLE(PackThreads);
+};
+
+using EstimateAndProfiled = std::pair<uint64_t, uint64_t>;  // cost est, cost profiled
+using Costs = std::unordered_map<uint32_t, EstimateAndProfiled>;
+
+void normalizeCosts(Costs& costs) {
+    const auto scaleCost = [](uint64_t value, double multiplier) {
+        double scaled = static_cast<double>(value) * multiplier;
+        if (value && scaled < 1) scaled = 1;
+        return static_cast<uint64_t>(scaled);
+    };
+
+    // For all costs with a profile, compute sum
+    uint64_t sumCostProfiled = 0;  // For data with estimate and profile
+    uint64_t sumCostEstimate = 0;  // For data with estimate and profile
+    for (const auto& est : costs) {
+        if (est.second.second) {
+            sumCostEstimate += est.second.first;
+            sumCostProfiled += est.second.second;
+        }
+    }
+
+    if (sumCostEstimate) {
+        // For data where we don't have profiled data, compute how much to
+        // scale up/down the estimate to make on same relative scale as
+        // profiled data. (Improves results if only a few profiles missing.)
+        const double estToProfile
+            = static_cast<double>(sumCostProfiled) / static_cast<double>(sumCostEstimate);
+        UINFO(5, "Estimated data needs scaling by "
+                     << estToProfile << ", sumCostProfiled=" << sumCostProfiled
+                     << " sumCostEstimate=" << sumCostEstimate << endl);
+        for (auto& est : costs) {
+            uint64_t& costEstimate = est.second.first;
+            costEstimate = scaleCost(costEstimate, estToProfile);
+        }
+    }
+
+    // COSTS can overflow a uint32. Using maximum value of costs, scale all down
+    uint64_t maxCost = 0;
+    for (auto& est : costs) {
+        const uint64_t& costEstimate = est.second.first;
+        const uint64_t& costProfiled = est.second.second;
+        if (maxCost < costEstimate) maxCost = costEstimate;
+        if (maxCost < costProfiled) maxCost = costProfiled;
+        UINFO(9, "Post uint scale: ce = " << est.second.first << " cp=" << est.second.second
+                                          << endl);
+    }
+    const uint64_t scaleDownTo = 10000000;  // Extra room for future algorithms to add costs
+    if (maxCost > scaleDownTo) {
+        const double scaleup = static_cast<double>(scaleDownTo) / static_cast<double>(maxCost);
+        UINFO(5, "Scaling data to within 32-bits by multiply by=" << scaleup << ", maxCost="
+                                                                  << maxCost << endl);
+        for (auto& est : costs) {
+            est.second.first = scaleCost(est.second.first, scaleup);
+            est.second.second = scaleCost(est.second.second, scaleup);
+        }
+    }
+}
+
+void fillinCosts(V3Graph* execMTaskGraphp) {
+    V3UniqueNames m_uniqueNames;  // For generating unique mtask profile hash names
+
+    // Pass 1: See what profiling data applies
+    Costs costs;  // For each mtask, costs
+
+    for (const V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp;
+         vxp = vxp->verticesNextp()) {
+        ExecMTask* const mtp = const_cast<V3GraphVertex*>(vxp)->as<ExecMTask>();
+        // Compute name of mtask, for hash lookup
+        mtp->hashName(m_uniqueNames.get(mtp->bodyp()));
+
+        // This estimate is 64 bits, but the final mtask graph algorithm needs 32 bits
+        const uint64_t costEstimate = V3InstrCount::count(mtp->bodyp(), false);
+        const uint64_t costProfiled
+            = V3Config::getProfileData(v3Global.opt.prefix(), mtp->hashName());
+        if (costProfiled) {
+            UINFO(5, "Profile data for mtask " << mtp->id() << " " << mtp->hashName()
+                                               << " cost override " << costProfiled << endl);
+        }
+        costs[mtp->id()] = std::make_pair(costEstimate, costProfiled);
+    }
+
+    normalizeCosts(costs /*ref*/);
+
+    int totalEstimates = 0;
+    int missingProfiles = 0;
+    for (const V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp;
+         vxp = vxp->verticesNextp()) {
+        ExecMTask* const mtp = const_cast<V3GraphVertex*>(vxp)->as<ExecMTask>();
+        const uint32_t costEstimate = costs[mtp->id()].first;
+        const uint64_t costProfiled = costs[mtp->id()].second;
+        UINFO(9, "ce = " << costEstimate << " cp=" << costProfiled << endl);
+        UASSERT(costEstimate <= (1UL << 31), "cost scaling math would overflow uint32");
+        UASSERT(costProfiled <= (1UL << 31), "cost scaling math would overflow uint32");
+        const uint32_t costProfiled32 = static_cast<uint32_t>(costProfiled);
+        uint32_t costToUse = costProfiled32;
+        if (!costProfiled32) {
+            costToUse = costEstimate;
+            if (costEstimate != 0) ++missingProfiles;
+        }
+        if (costEstimate != 0) ++totalEstimates;
+        mtp->cost(costToUse);
+        mtp->priority(costToUse);
+    }
+
+    if (missingProfiles) {
+        if (FileLine* const fl = V3Config::getProfileDataFileLine()) {
+            fl->v3warn(PROFOUTOFDATE, "Profile data for mtasks may be out of date. "
+                                          << missingProfiles << " of " << totalEstimates
+                                          << " mtasks had no data");
+        }
+    }
+}
+
+void finalizeCosts(V3Graph* execMTaskGraphp) {
+    GraphStreamUnordered ser(execMTaskGraphp, GraphWay::REVERSE);
+    while (const V3GraphVertex* const vxp = ser.nextp()) {
+        ExecMTask* const mtp = const_cast<V3GraphVertex*>(vxp)->as<ExecMTask>();
+        // "Priority" is the critical path from the start of the mtask, to
+        // the end of the graph reachable from this mtask. Given the
+        // choice among several ready mtasks, we'll want to start the
+        // highest priority one first, so we're always working on the "long
+        // pole"
+        for (V3GraphEdge* edgep = mtp->outBeginp(); edgep; edgep = edgep->outNextp()) {
+            const ExecMTask* const followp = edgep->top()->as<ExecMTask>();
+            if ((followp->priority() + mtp->cost()) > mtp->priority()) {
+                mtp->priority(followp->priority() + mtp->cost());
+            }
+        }
+    }
+
+    // Some MTasks may now have zero cost, eliminate those.
+    // (It's common for tasks to shrink to nothing when V3LifePost
+    // removes dly assignments.)
+    for (V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp;) {
+        ExecMTask* const mtp = vxp->as<ExecMTask>();
+        vxp = vxp->verticesNextp();  // Advance before delete
+
+        // Don't rely on checking mtp->cost() == 0 to detect an empty task.
+        // Our cost-estimating logic is just an estimate. Instead, check
+        // the MTaskBody to see if it's empty. That's the source of truth.
+        AstMTaskBody* const bodyp = mtp->bodyp();
+        if (!bodyp->stmtsp()) {  // Kill this empty mtask
+            UINFO(6, "Removing zero-cost " << mtp->name() << endl);
+            for (V3GraphEdge* inp = mtp->inBeginp(); inp; inp = inp->inNextp()) {
+                for (V3GraphEdge* outp = mtp->outBeginp(); outp; outp = outp->outNextp()) {
+                    new V3GraphEdge{execMTaskGraphp, inp->fromp(), outp->top(), 1};
+                }
+            }
+            VL_DO_DANGLING(mtp->unlinkDelete(execMTaskGraphp), mtp);
+            // Also remove and delete the AstMTaskBody, otherwise it would
+            // keep a dangling pointer to the ExecMTask.
+            VL_DO_DANGLING(bodyp->unlinkFrBack()->deleteTree(), bodyp);
+        }
+    }
+
+    // Assign profiler IDs
+    for (V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp;
+         vxp = vxp->verticesNextp()) {
+        static_cast<ExecMTask*>(vxp)->profilerId(v3Global.rootp()->allocNextMTaskProfilingID());
+    }
+
+    // Removing tasks may cause edges that were formerly non-transitive to
+    // become transitive. Also we just created new edges around the removed
+    // tasks, which could be transitive. Prune out all transitive edges.
+    execMTaskGraphp->removeTransitiveEdges();
+
+    // Record summary stats for final mtask graph.
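+    // Parallelism factor is (assuming the usual definition, which the fields below
+    // suggest) totalGraphCost / criticalPathCost: the speedup an idealized machine
+    // with unlimited threads could reach on this dependency graph.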
+    const auto report = execMTaskGraphp->parallelismReport(
+        [](const V3GraphVertex* vtxp) { return vtxp->as<const ExecMTask>()->cost(); });
+    V3Stats::addStat("MTask graph, final, critical path cost", report.criticalPathCost());
+    V3Stats::addStat("MTask graph, final, total graph cost", report.totalGraphCost());
+    V3Stats::addStat("MTask graph, final, mtask count", report.vertexCount());
+    V3Stats::addStat("MTask graph, final, edge count", report.edgeCount());
+    V3Stats::addStat("MTask graph, final, parallelism factor", report.parallelismFactor());
+    if (debug() >= 3) {
+        UINFO(0, "\n");
+        UINFO(0, "    Final mtask parallelism report:\n");
+        UINFO(0, "    Critical path cost = " << report.criticalPathCost() << "\n");
+        UINFO(0, "    Total graph cost = " << report.totalGraphCost() << "\n");
+        UINFO(0, "    MTask vertex count = " << report.vertexCount() << "\n");
+        UINFO(0, "    Edge count = " << report.edgeCount() << "\n");
+        UINFO(0, "    Parallelism factor = " << report.parallelismFactor() << "\n");
+    }
+}
+
+void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t threadId, AstCFunc* funcp,
+                        const ExecMTask* mtaskp) {
+    AstNodeModule* const modp = v3Global.rootp()->topModulep();
+    FileLine* const fl = modp->fileline();
+
+    // Helper function to make the code a bit more legible
+    const auto addStrStmt = [=](const string& stmt) -> void {  //
+        funcp->addStmtsp(new AstCStmt{fl, stmt});
+    };
+
+    if (const uint32_t nDependencies = schedule.crossThreadDependencies(mtaskp)) {
+        // This mtask has dependencies executed on another thread, so it may block. Create the
+        // task state variable and wait to be notified.
+        const string name = "__Vm_mtaskstate_" + cvtToStr(mtaskp->id());
+        AstBasicDType* const mtaskStateDtypep
+            = v3Global.rootp()->typeTablep()->findBasicDType(fl, VBasicDTypeKwd::MTASKSTATE);
+        AstVar* const varp = new AstVar{fl, VVarType::MODULETEMP, name, mtaskStateDtypep};
+        varp->valuep(new AstConst{fl, nDependencies});
+        varp->protect(false);  // Do not protect as we still have references in AstText
+        modp->addStmtsp(varp);
+        // For now, reference is still via text bashing
+        addStrStmt("vlSelf->" + name + ".waitUntilUpstreamDone(even_cycle);\n");
+    }
+
+    if (v3Global.opt.profPgo()) {
+        // No lock around startCounter, as counter numbers are unique per thread
+        addStrStmt("vlSymsp->_vm_pgoProfiler.startCounter(" + std::to_string(mtaskp->profilerId())
+                   + ");\n");
+    }
+
+    // Move the actual body into this function
+    funcp->addStmtsp(mtaskp->bodyp()->unlinkFrBack());
+
+    if (v3Global.opt.profPgo()) {
+        // No lock around stopCounter, as counter numbers are unique per thread
+        addStrStmt("vlSymsp->_vm_pgoProfiler.stopCounter(" + std::to_string(mtaskp->profilerId())
+                   + ");\n");
+    }
+
+    // For any dependent mtask that's on another thread, signal one dependency completion.
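+    // The emitted text pairs up with the wait generated above (a sketch of the
+    // generated code, using the same strings): a consumer with N cross-thread
+    // dependencies does
+    //   vlSelf->__Vm_mtaskstate_<id>.waitUntilUpstreamDone(even_cycle);
+    // while each cross-thread producer below emits
+    //   vlSelf->__Vm_mtaskstate_<id>.signalUpstreamDone(even_cycle);
+    // notifying that consumer's counter.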
+    for (V3GraphEdge* edgep = mtaskp->outBeginp(); edgep; edgep = edgep->outNextp()) {
+        const ExecMTask* const nextp = edgep->top()->as<ExecMTask>();
+        if (schedule.threadId(nextp) != threadId) {
+            addStrStmt("vlSelf->__Vm_mtaskstate_" + cvtToStr(nextp->id())
+                       + ".signalUpstreamDone(even_cycle);\n");
+        }
+    }
+}
+
+const std::vector<AstCFunc*> createThreadFunctions(const ThreadSchedule& schedule,
+                                                   const string& tag) {
+    AstNodeModule* const modp = v3Global.rootp()->topModulep();
+    FileLine* const fl = modp->fileline();
+
+    std::vector<AstCFunc*> funcps;
+
+    // For each thread, create a function representing its entry point
+    for (const std::vector<const ExecMTask*>& thread : schedule.threads) {
+        if (thread.empty()) continue;
+        const uint32_t threadId = schedule.threadId(thread.front());
+        const string name{"__Vthread__" + tag + "__" + cvtToStr(threadId)};
+        AstCFunc* const funcp = new AstCFunc{fl, name, nullptr, "void"};
+        modp->addStmtsp(funcp);
+        funcps.push_back(funcp);
+        funcp->isStatic(true);  // Uses void self pointer, so static and hand rolled
+        funcp->isLoose(true);
+        funcp->entryPoint(true);
+        funcp->argTypes("void* voidSelf, bool even_cycle");
+
+        // Set up vlSelf and vlSyms
+        funcp->addStmtsp(new AstCStmt{fl, EmitCBase::voidSelfAssign(modp)});
+        funcp->addStmtsp(new AstCStmt{fl, EmitCBase::symClassAssign()});
+
+        // Invoke each mtask scheduled to this thread from the thread function
+        for (const ExecMTask* const mtaskp : thread) {
+            addMTaskToFunction(schedule, threadId, funcp, mtaskp);
+        }
+
+        // Unblock the fake "final" mtask when this thread is finished
+        funcp->addStmtsp(new AstCStmt{fl, "vlSelf->__Vm_mtaskstate_final__" + tag
+                                              + ".signalUpstreamDone(even_cycle);\n"});
+    }
+
+    // Create the fake "final" mtask state variable
+    AstBasicDType* const mtaskStateDtypep
+        = v3Global.rootp()->typeTablep()->findBasicDType(fl, VBasicDTypeKwd::MTASKSTATE);
+    AstVar* const varp
+        = new AstVar{fl, VVarType::MODULETEMP, "__Vm_mtaskstate_final__" + tag, mtaskStateDtypep};
+    varp->valuep(new AstConst(fl, funcps.size()));
+    varp->protect(false);  // Do not protect as we still have references in AstText
+    modp->addStmtsp(varp);
+
+    return funcps;
+}
+
+void addThreadStartToExecGraph(AstExecGraph* const execGraphp,
+                               const std::vector<AstCFunc*>& funcps) {
+    // FileLine used for constructing nodes below
+    FileLine* const fl = v3Global.rootp()->fileline();
+    const string& tag = execGraphp->name();
+
+    // Add thread function invocations to execGraph
+    const auto addStrStmt = [=](const string& stmt) -> void {  //
+        execGraphp->addStmtsp(new AstCStmt{fl, stmt});
+    };
+    const auto addTextStmt = [=](const string& text) -> void {
+        execGraphp->addStmtsp(new AstText{fl, text, /* tracking: */ true});
+    };
+
+    if (v3Global.opt.profExec()) {
+        addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphBegin();\n");
+    }
+
+    addStrStmt("vlSymsp->__Vm_even_cycle__" + tag + " = !vlSymsp->__Vm_even_cycle__" + tag
+               + ";\n");
+
+    const uint32_t last = funcps.size() - 1;
+    for (uint32_t i = 0; i <= last; ++i) {
+        AstCFunc* const funcp = funcps.at(i);
+        if (i != last) {
+            // The first N-1 will run on the thread pool.
+            addTextStmt("vlSymsp->__Vm_threadPoolp->workerp(" + cvtToStr(i) + ")->addTask(");
+            execGraphp->addStmtsp(new AstAddrOfCFunc{fl, funcp});
+            addTextStmt(", vlSelf, vlSymsp->__Vm_even_cycle__" + tag + ");\n");
+        } else {
+            // The last will run on the main thread.
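+            // Overall the dispatch emitted here looks roughly like (sketch, N threads):
+            //   vlSymsp->__Vm_threadPoolp->workerp(0)->addTask(&__Vthread__<tag>__0, ...);
+            //   ...
+            //   __Vthread__<tag>__<N-1>(vlSelf, even_cycle);  // direct call, main thread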
+            AstCCall* const callp = new AstCCall{fl, funcp};
+            callp->dtypeSetVoid();
+            callp->argTypes("vlSelf, vlSymsp->__Vm_even_cycle__" + tag);
+            execGraphp->addStmtsp(callp->makeStmt());
+            addStrStmt("Verilated::mtaskId(0);\n");
+        }
+    }
+
+    addStrStmt("vlSelf->__Vm_mtaskstate_final__" + tag
+               + ".waitUntilUpstreamDone(vlSymsp->__Vm_even_cycle__" + tag + ");\n");
+
+    if (v3Global.opt.profExec()) {
+        addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphEnd();\n");
+    }
+}
+
+void wrapMTaskBodies(AstExecGraph* const execGraphp) {
+    FileLine* const flp = execGraphp->fileline();
+    const string& tag = execGraphp->name();
+    AstNodeModule* const modp = v3Global.rootp()->topModulep();
+
+    for (AstMTaskBody* mtaskBodyp = execGraphp->mTaskBodiesp(); mtaskBodyp;
+         mtaskBodyp = VN_AS(mtaskBodyp->nextp(), MTaskBody)) {
+        ExecMTask* const mtaskp = mtaskBodyp->execMTaskp();
+        const std::string name = tag + "_mtask" + std::to_string(mtaskp->id());
+        AstCFunc* const funcp = new AstCFunc{flp, name, nullptr};
+        funcp->isLoose(true);
+        modp->addStmtsp(funcp);
+
+        // Helper function to make the code a bit more legible
+        const auto addStrStmt = [=](const string& stmt) -> void {  //
+            funcp->addStmtsp(new AstCStmt{flp, stmt});
+        };
+
+        if (v3Global.opt.profExec()) {
+            const string& id = std::to_string(mtaskp->id());
+            const string& predictStart = std::to_string(mtaskp->predictStart());
+            addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskBegin(" + id + ", " + predictStart
+                       + ");\n");
+        }
+
+        // Set mtask ID in the run-time system
+        addStrStmt("Verilated::mtaskId(" + std::to_string(mtaskp->id()) + ");\n");
+
+        // Run body
+        funcp->addStmtsp(mtaskBodyp->stmtsp()->unlinkFrBackWithNext());
+
+        // Flush message queue
+        addStrStmt("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
+
+        if (v3Global.opt.profExec()) {
+            const string& id = std::to_string(mtaskp->id());
+            const string& predictConst = std::to_string(mtaskp->cost());
+            addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskEnd(" + id + ", " + predictConst
+                       + ");\n");
+        }
+
+        // AstMTask will simply contain a call
+        AstCCall* const callp = new AstCCall{flp, funcp};
+        callp->selfPointer(VSelfPointerText{VSelfPointerText::This{}});
+        callp->dtypeSetVoid();
+        mtaskBodyp->addStmtsp(callp->makeStmt());
+    }
+}
+
+void implementExecGraph(AstExecGraph* const execGraphp) {
+    // Nothing to be done if there are no MTasks in the graph at all.
+    if (execGraphp->depGraphp()->empty()) return;
+
+    // Schedule the mtasks: statically associate each mtask with a thread,
+    // and determine the order in which each thread will run its mtasks.
+    const ThreadSchedule& schedule = PackThreads{}.pack(*execGraphp->depGraphp());
+
+    // Create a function to be run by each thread. Note this moves all AstMTaskBody nodes from
+    // the AstExecGraph into the AstCFuncs created.
+    const std::vector<AstCFunc*>& funcps = createThreadFunctions(schedule, execGraphp->name());
+    UASSERT(!funcps.empty(), "Non-empty ExecGraph yields no threads?");
+
+    // Start the thread functions at the point this AstExecGraph is located in the tree.
+    addThreadStartToExecGraph(execGraphp, funcps);
+}
+
+void implement(AstNetlist* netlistp) {
+    // Called by Verilator top stage
+    netlistp->topModulep()->foreach([&](AstExecGraph* execGraphp) {
+        // Back in V3Order, we partitioned mtasks using provisional cost
+        // estimates. However, V3Order precedes some optimizations (notably
+        // V3LifePost) that can change the cost of logic within each mtask.
+        // Now that logic is final, recompute the cost and priority of each
+        // ExecMTask.
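+        // Order of operations: fillinCosts() refreshes per-mtask costs (merging any
+        // profile data from V3Config::getProfileData with fresh instruction counts),
+        // finalizeCosts() propagates priorities and drops now-empty mtasks, and only
+        // then is the multi-threaded schedule built below.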
+        fillinCosts(execGraphp->depGraphp());
+        finalizeCosts(execGraphp->depGraphp());
+
+        // Wrap each MTask body into a CFunc for better profiling/debugging
+        wrapMTaskBodies(execGraphp);
+
+        // Replace the graph body with its multi-threaded implementation.
+        implementExecGraph(execGraphp);
+    });
+}
+
+void selfTest() {
+    {  // Test that omitted profile data correctly scales estimates
+        Costs costs({// id  est  prof
+                     {1, {10, 1000}},
+                     {2, {20, 0}},  // Note no profile
+                     {3, {30, 3000}}});
+        normalizeCosts(costs);
+        UASSERT_SELFTEST(uint64_t, costs[1].first, 1000);
+        UASSERT_SELFTEST(uint64_t, costs[1].second, 1000);
+        UASSERT_SELFTEST(uint64_t, costs[2].first, 2000);
+        UASSERT_SELFTEST(uint64_t, costs[2].second, 0);
+        UASSERT_SELFTEST(uint64_t, costs[3].first, 3000);
+        UASSERT_SELFTEST(uint64_t, costs[3].second, 3000);
+    }
+    {  // Test that very large profile data properly scales
+        Costs costs({// id  est  prof
+                     {1, {10, 100000000000}},
+                     {2, {20, 200000000000}},
+                     {3, {30, 1}}});  // Make sure doesn't underflow
+        normalizeCosts(costs);
+        UASSERT_SELFTEST(uint64_t, costs[1].first, 2500000);
+        UASSERT_SELFTEST(uint64_t, costs[1].second, 5000000);
+        UASSERT_SELFTEST(uint64_t, costs[2].first, 5000000);
+        UASSERT_SELFTEST(uint64_t, costs[2].second, 10000000);
+        UASSERT_SELFTEST(uint64_t, costs[3].first, 7500000);
+        UASSERT_SELFTEST(uint64_t, costs[3].second, 1);
+    }
+
+    PackThreads::selfTest();
+}
+
+}  // namespace V3ExecGraph
diff --git a/src/V3ExecGraph.h b/src/V3ExecGraph.h
new file mode 100644
index 000000000..660276e76
--- /dev/null
+++ b/src/V3ExecGraph.h
@@ -0,0 +1,33 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//*************************************************************************
+// DESCRIPTION: Verilator: AstExecGraph code construction
+//
+// Code available from: https://verilator.org
+//
+//*************************************************************************
+//
+// Copyright 2003-2024 by Wilson Snyder. This program is free software; you
+// can redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+//
+//*************************************************************************
+
+#ifndef VERILATOR_V3EXECGRAPH_H_
+#define VERILATOR_V3EXECGRAPH_H_
+
+#include "config_build.h"
+#include "verilatedos.h"
+
+#include "V3ThreadSafety.h"
+
+class AstNetlist;
+
+namespace V3ExecGraph {
+void implement(AstNetlist*) VL_MT_DISABLED;
+
+void selfTest() VL_MT_DISABLED;
+}  // namespace V3ExecGraph
+
+#endif  // Guard
diff --git a/src/V3Order.h b/src/V3Order.h
index 0300256e4..65ec500b8 100644
--- a/src/V3Order.h
+++ b/src/V3Order.h
@@ -55,6 +55,8 @@ AstCFunc* order(
     const ExternalDomainsProvider& externalDomains
     = [](const AstVarScope*, std::vector<const AstSenTree*>&) {}) VL_MT_DISABLED;
 
+void selfTestParallel();
+
 };  // namespace V3Order
 
 #endif  // Guard
diff --git a/src/V3OrderParallel.cpp b/src/V3OrderParallel.cpp
index 49aa08f27..7eff76e0a 100644
--- a/src/V3OrderParallel.cpp
+++ b/src/V3OrderParallel.cpp
@@ -1,6 +1,6 @@
 // -*- mode: C++; c-file-style: "cc-mode" -*-
 //*************************************************************************
-// DESCRIPTION: Verilator: Block code ordering
+// DESCRIPTION: Verilator: Multi-threaded code partitioning and ordering
 //
 // Code available from: https://verilator.org
 //
@@ -20,20 +20,2457 @@
 #include "V3PchAstNoMT.h"  // VL_MT_DISABLED_CODE_UNIT
 
+#include "V3Config.h"
+#include "V3File.h"
 #include "V3Graph.h"
 #include "V3GraphStream.h"
+#include "V3InstrCount.h"
 #include "V3List.h"
 #include "V3OrderCFuncEmitter.h"
 #include "V3OrderInternal.h"
 #include "V3OrderMoveGraphBuilder.h"
-#include "V3Partition.h"
+#include "V3Os.h"
+#include "V3PairingHeap.h"
 #include "V3PartitionGraph.h"
+#include "V3Scoreboard.h"
+#include "V3Stats.h"
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <list>
 #include <memory>
+#include <unordered_set>
 #include <vector>
 
 VL_DEFINE_DEBUG_FUNCTIONS;
 
+class LogicMTask;
+class MTaskEdge;
+class MergeCandidate;
+class SiblingMC;
+
+// Similar to OrderMoveVertex, but modified for threaded code generation.
+class MTaskMoveVertex final : public V3GraphVertex {
+    VL_RTTI_IMPL(MTaskMoveVertex, V3GraphVertex)
+    // This could be more compact, since we know m_varp and m_logicp
+    // cannot both be set. Each MTaskMoveVertex represents a logic node
+    // or a var node, it can't be both.
+    OrderLogicVertex* const m_logicp;  // Logic represented by this vertex
+    const AstSenTree* const m_domainp;
+
+public:
+    MTaskMoveVertex(V3Graph& graph, OrderLogicVertex* logicp,
+                    const AstSenTree* domainp) VL_MT_DISABLED : V3GraphVertex{&graph},
+                                                                m_logicp{logicp},
+                                                                m_domainp{domainp} {}
+    ~MTaskMoveVertex() override = default;
+
+    // ACCESSORS
+    OrderLogicVertex* logicp() const { return m_logicp; }
+    const AstScope* scopep() const { return m_logicp ? m_logicp->scopep() : nullptr; }
+    const AstSenTree* domainp() const { return m_domainp; }
+
+    string dotColor() const override {
+        if (logicp()) {
+            return logicp()->dotColor();
+        } else {
+            return "yellow";
+        }
+    }
+    string name() const override {
+        string nm;
+        if (logicp()) {
+            nm = logicp()->name();
+            nm += (string{"\\nMV:"} + " d=" + cvtToHex(logicp()->domainp()) + " s="
+                   + cvtToHex(logicp()->scopep())
+                   // "color()" represents the mtask ID.
+ + "\\nt=" + cvtToStr(color())); + } else { + nm = "nolog\\nt=" + cvtToStr(color()); + } + return nm; + } +}; + +//************************************************************************* +// V3Partition takes the fine-grained logic graph from V3Order and +// collapses it into a coarse-grained graph of AbstractLogicMTask's, each +// of which contains of set of the logic nodes from the fine-grained +// graph. + +class V3Partition final { + // MEMBERS + const OrderGraph* const m_orderGraphp; // The OrderGraph + const V3Graph* const m_fineDepsGraphp; // Fine-grained dependency graph + + LogicMTask* m_entryMTaskp = nullptr; // Singular source vertex of the dependency graph + LogicMTask* m_exitMTaskp = nullptr; // Singular sink vertex of the dependency graph + +public: + // CONSTRUCTORS + explicit V3Partition(const OrderGraph* orderGraphp, const V3Graph* fineDepsGraphp) + : m_orderGraphp{orderGraphp} + , m_fineDepsGraphp{fineDepsGraphp} {} + ~V3Partition() = default; + + // METHODS + + // Fill in the provided empty graph with AbstractLogicMTask's and their + // interdependencies. + void go(V3Graph* mtasksp) VL_MT_DISABLED; + + // Print out a hash of the shape of graphp. Only needed to debug the + // origin of some nondeterminism; otherwise this is pretty useless. + static void hashGraphDebug(const V3Graph* graphp, const char* debugName) VL_MT_DISABLED; + + // Print debug stats about graphp whose nodes must be AbstractMTask's. + static void debugMTaskGraphStats(const V3Graph* graphp, const string& stage) VL_MT_DISABLED; + +private: + uint32_t setupMTaskDeps(V3Graph* mtasksp) VL_MT_DISABLED; + + VL_UNCOPYABLE(V3Partition); +}; + +// ###################################################################### +// Partitioner tunable settings: +// +// Before describing these settings, a bit of background: +// +// Early during the development of the partitioner, V3Split was failing to +// split large always blocks (with ~100K assignments) so we had to handle +// very large vertices with ~100K incoming and outgoing edges. +// +// The partitioner attempts to deal with such densely connected +// graphs. Some of the tuning parameters below reference "huge vertices", +// that's what they're talking about, vertices with tens of thousands of +// edges in and out. Whereas most graphs have only tens of edges in and out +// of most vertices. +// +// V3Split has since been fixed to more reliably split large always +// blocks. It's kind of an open question whether the partitioner must +// handle huge nodes gracefully. Maybe not! But it still can, given +// appropriate tuning. + +// PART_SIBLING_EDGE_LIMIT (integer) +// +// Arbitrarily limit the number of edges on a single vertex that will be +// considered when enumerating siblings, to the given value. This protects +// the partitioner runtime in the presence of huge vertices. +// +// The sibling-merge is less important than the edge merge. (You can +// totally disable the sibling merge and get halfway decent partitions; you +// can't disable edge merges, those are fundamental to the process.) So, +// skipping the enumeration of some siblings on a few vertices does not +// have a large impact on the result of the partitioner. +// +// If your vertices are small, the limit (at 26) approaches a no-op. Hence +// there's basically no cost to applying this limit even when we don't +// expect huge vertices. +// +// If you don't care about partitioner runtime and you want the most +// aggressive partition, set the limit very high. 
+// vertices, leave this as is.
+constexpr unsigned PART_SIBLING_EDGE_LIMIT = 26;
+
+// PART_STEPPED_COST (defined/undef)
+//
+// When computing critical path costs, use a step function on the actual
+// underlying vertex cost.
+//
+// If there are huge vertices, when a tiny vertex merges into a huge
+// vertex, we can often avoid increasing the huge vertex's stepped cost.
+// If the stepped cost hasn't increased, and the critical path into the huge
+// vertex hasn't increased, we can avoid propagating a new critical path to
+// vertices past the huge vertex. Since huge vertices tend to have huge lists
+// of children and parents, this can be a substantial savings.
+//
+// Does not seem to reduce the quality of the partitioner's output.
+//
+// If you have huge vertices, leave this 'true', it is the major setting
+// that allows the partitioner to handle such difficult graphs on anything
+// like a human time scale.
+//
+// If you don't have huge vertices, the 'true' value doesn't help much but
+// should cost almost nothing in terms of partitioner quality.
+//
+// If you want the most aggressive possible partition, set it "false" and
+// be prepared to be disappointed when the improvement in the partition is
+// negligible / in the noise.
+//
+// Q) Why retain the control, if there is really no downside?
+//
+// A) Cost stepping can lead to corner cases. A developer may wish to
+//    disable cost stepping to rule it out as the cause of unexpected
+//    behavior.
+#define PART_STEPPED_COST true
+
+// Don't produce more than a certain maximum number of MTasks. This helps
+// the TSP variable sort not to blow up (a concern for some of the tests)
+// and we probably don't want a huge number of mtasks in practice anyway
+// (50 to 100 is typical.)
+//
+// If the user doesn't give one with '--threads-max-mtasks', we'll set the
+// maximum # of MTasks to
+// (# of threads * PART_DEFAULT_MAX_MTASKS_PER_THREAD)
+constexpr unsigned PART_DEFAULT_MAX_MTASKS_PER_THREAD = 50;
+
+// end tunables.
+
+//######################################################################
+// Misc graph and assertion utilities
+
+static void partCheckCachedScoreVsActual(uint32_t cached, uint32_t actual) {
+#if PART_STEPPED_COST
+    // Cached CP might be a little bigger than actual, due to stepped CPs.
+    // Example:
+    // Let's say we have a parent with stepped_cost 40 and a grandparent
+    // with stepped_cost 27. Our forward-cp is 67. Then our parent and
+    // grandparent get merged, the merged node has stepped cost 66. We
+    // won't propagate that new CP to children as it hasn't grown. So,
+    // children may continue to think that the CP coming through this path
+    // is a little higher than it really is; permit that.
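+    // Example with this roughly 10% tolerance: cached == 66 against actual == 61
+    // passes, since 66*10 <= 61*11 and 66*11 >= 61*10; a cached value more than about
+    // 10% away from actual in either direction trips the assertion below.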
+    UASSERT((((cached * 10) <= (actual * 11)) && (cached * 11) >= (actual * 10)),
+            "Calculation error in scoring (approximate, may need tweak)");
+#else
+    UASSERT(cached == actual, "Calculation error in scoring");
+#endif
+}
+
+//=============================================================================
+// We keep MTaskEdge graph edges in a PairingHeap, sorted by score and id
+
+struct EdgeKey final {
+    // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node
+    uint64_t m_id;  // Unique ID part of edge score
+    uint32_t m_score;  // Score part of ID
+    void increase(uint32_t score) {
+#if VL_DEBUG
+        UASSERT(score >= m_score, "Must increase");
+#endif
+        m_score = score;
+    }
+    bool operator<(const EdgeKey& other) const {
+        // First by Score then by ID
+        return m_score < other.m_score || (m_score == other.m_score && m_id < other.m_id);
+    }
+};
+
+using EdgeHeap = PairingHeap<EdgeKey>;
+
+//=============================================================================
+// LogicMTask
+
+class LogicMTask final : public AbstractLogicMTask {
+    VL_RTTI_IMPL(LogicMTask, AbstractLogicMTask)
+    template <GraphWay::en T_Way>
+    friend class PartPropagateCp;
+
+public:
+    // TYPES
+    using VxList = std::list<MTaskMoveVertex*>;
+
+    struct CmpLogicMTask final {
+        bool operator()(const LogicMTask* ap, const LogicMTask* bp) const {
+            return ap->id() < bp->id();
+        }
+    };
+
+private:
+    // MEMBERS
+
+    // Set of MTaskMoveVertex's assigned to this mtask. LogicMTask does not
+    // own the MTaskMoveVertex objects, we merely keep pointers to them
+    // here.
+    VxList m_mvertices;
+
+    // Cost estimate for this LogicMTask, derived from V3InstrCount.
+    // In abstract time units.
+    uint32_t m_cost = 0;
+
+    // Cost of critical paths going FORWARD from graph-start to the start
+    // of this vertex, and also going REVERSE from the end of the graph to
+    // the end of the vertex. Same units as m_cost.
+    std::array<uint32_t, GraphWay::NUM_WAYS> m_critPathCost;
+
+    uint32_t m_serialId;  // Unique MTask ID number
+
+    // Count "generations" which are just operations that scan through the
+    // graph. We'll mark each node with the last generation that scanned
+    // it. We can use this to avoid recursing through the same node twice
+    // while searching for a path.
+    uint64_t m_generation = 0;
+
+    // Store a set of forward relatives so we can quickly check if we have a given child
+    std::unordered_set<LogicMTask*> m_edgeSet;
+    // Store the outgoing and incoming edges in a heap sorted by the critical path length
+    std::array<EdgeHeap, GraphWay::NUM_WAYS> m_edgeHeap;
+
+    // MTasks for which a SiblingMC exists with 'this' as the higher ID MTask (m_ap in SiblingMC)
+    std::set<LogicMTask*, CmpLogicMTask> m_siblings;
+    // List of SiblingMCs for which this is the higher ID MTask (m_ap in SiblingMC)
+    V3List<SiblingMC*> m_aSiblingMCs;
+    // List of SiblingMCs for which this is the lower ID MTask (m_bp in SiblingMC)
+    V3List<SiblingMC*> m_bSiblingMCs;
+
+public:
+    // CONSTRUCTORS
+    LogicMTask(V3Graph* graphp, MTaskMoveVertex* mtmvVxp)
+        : AbstractLogicMTask{graphp} {
+        for (uint32_t& item : m_critPathCost) item = 0;
+        if (mtmvVxp) {  // Else null for test
+            m_mvertices.push_back(mtmvVxp);
+            if (const OrderLogicVertex* const olvp = mtmvVxp->logicp()) {
+                m_cost += V3InstrCount::count(olvp->nodep(), true);
+            }
+        }
+        // Start at 1, so that 0 indicates no mtask ID.
+        static uint32_t s_nextId = 1;
+        m_serialId = s_nextId++;
+        UASSERT(s_nextId < 0xFFFFFFFFUL, "Too many mtasks");
+    }
+
+    // METHODS
+    std::set<LogicMTask*, CmpLogicMTask>& siblings() { return m_siblings; };
+    V3List<SiblingMC*>& aSiblingMCs() { return m_aSiblingMCs; };
+    V3List<SiblingMC*>& bSiblingMCs() { return m_bSiblingMCs; };
+
+    void moveAllVerticesFrom(LogicMTask* otherp) {
+        // splice() is constant time
+        m_mvertices.splice(m_mvertices.end(), otherp->m_mvertices);
+        m_cost += otherp->m_cost;
+    }
+    const VxList* vertexListp() const override { return &m_mvertices; }
+    static uint64_t incGeneration() {
+        static uint64_t s_generation = 0;
+        ++s_generation;
+        return s_generation;
+    }
+
+    // Use this instead of pointer-compares to compare LogicMTasks. Avoids
+    // nondeterministic output. Also name mtasks based on this number in
+    // the final C++ output.
+    uint32_t id() const override { return m_serialId; }
+    void id(uint32_t id) { m_serialId = id; }
+    // Abstract cost of every logic mtask
+    uint32_t cost() const override VL_MT_SAFE { return m_cost; }
+    void setCost(uint32_t cost) { m_cost = cost; }  // For tests only
+    uint32_t stepCost() const { return stepCost(m_cost); }
+    static uint32_t stepCost(uint32_t cost) {
+#if PART_STEPPED_COST
+        // Round cost up to the nearest 5%. Use this when computing all
+        // critical paths. The idea is that critical path changes don't
+        // need to propagate when they don't exceed the next step, saving a
+        // lot of recursion.
+        if (cost == 0) return 0;
+
+        double logcost = log(cost);
+        // log(1.05) is about 0.05
+        // So, round logcost up to the next 0.05 boundary
+        logcost *= 20.0;
+        logcost = ceil(logcost);
+        logcost = logcost / 20.0;
+
+        const uint32_t stepCost = static_cast<uint32_t>(exp(logcost));
+#if VL_DEBUG
+        UASSERT_STATIC(stepCost >= cost, "stepped cost error exceeded");
+        UASSERT_STATIC(stepCost <= ((cost * 11 / 10)), "stepped cost error exceeded");
+#endif
+        return stepCost;
+#else
+        return cost;
+#endif
+    }
+
+    template <GraphWay::en T_Way>
+    void addRelativeEdge(MTaskEdge* edgep);
+    template <GraphWay::en T_Way>
+    void stealRelativeEdge(MTaskEdge* edgep);
+    template <GraphWay::en T_Way>
+    void removeRelativeEdge(MTaskEdge* edgep);
+
+    void addRelativeMTask(LogicMTask* relativep) {
+        // Add the relative to connecting edge map
+        VL_ATTR_UNUSED const bool exists = !m_edgeSet.emplace(relativep).second;
+#if VL_DEBUG
+        UASSERT(!exists, "Adding existing relative");
+#endif
+    }
+    void removeRelativeMTask(LogicMTask* relativep) {
+        VL_ATTR_UNUSED const size_t removed = m_edgeSet.erase(relativep);
+#if VL_DEBUG
+        UASSERT(removed, "Relative should have been in set");
+#endif
+    }
+    bool hasRelativeMTask(LogicMTask* relativep) const { return m_edgeSet.count(relativep); }
+
+    void checkRelativesCp(GraphWay way) const;
+
+    string name() const override VL_MT_STABLE {
+        // Display forward and reverse critical path costs. This gives a quick
+        // read on whether graph partitioning looks reasonable or bad.
+        std::ostringstream out;
+        out << "mt" << m_serialId << "." << this << " [b" << m_critPathCost[GraphWay::FORWARD]
+            << " a" << m_critPathCost[GraphWay::REVERSE] << " c" << cost();
+        return out.str();
+    }
+
+    void setCritPathCost(GraphWay way, uint32_t cost) { m_critPathCost[way] = cost; }
+    uint32_t critPathCost(GraphWay way) const { return m_critPathCost[way]; }
+    uint32_t critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const;
+
+private:
+    static bool pathExistsFromInternal(LogicMTask* fromp, LogicMTask* top,
+                                       const V3GraphEdge* excludedEdgep, uint64_t generation) {
+        // Q) Why does this take LogicMTask instead of generic V3GraphVertex?
+        // A) We'll use the critical paths known to LogicMTask to prune the
+        //    recursion for speed. Also store 'generation' in
+        //    LogicMTask::m_generation so we can prune the search and avoid
+        //    recursing through the same node more than once in a single
+        //    search.
+
+        if (fromp->m_generation == generation) {
+            // Already looked at this node in the current search.
+            // Since we're back again, we must not have found a path on the
+            // first go.
+            return false;
+        }
+        fromp->m_generation = generation;
+
+        // Base case: we found a path.
+        if (fromp == top) return true;
+
+        // Base case: fromp is too late, cannot possibly be a prereq for top.
+        if (fromp->critPathCost(GraphWay::REVERSE)
+            < (top->critPathCost(GraphWay::REVERSE) + top->stepCost())) {
+            return false;
+        }
+        if ((fromp->critPathCost(GraphWay::FORWARD) + fromp->stepCost())
+            > top->critPathCost(GraphWay::FORWARD)) {
+            return false;
+        }
+
+        // Recursively look for a path
+        for (const V3GraphEdge* followp = fromp->outBeginp(); followp;
+             followp = followp->outNextp()) {
+            if (followp == excludedEdgep) continue;
+            LogicMTask* const nextp = static_cast<LogicMTask*>(followp->top());
+            if (pathExistsFromInternal(nextp, top, nullptr, generation)) return true;
+        }
+        return false;
+    }
+
+    // True if there's a path from 'fromp' to 'top' excluding
+    // 'excludedEdgep', false otherwise.
+    //
+    // 'excludedEdgep' may be nullptr in which case no edge is excluded. If
+    // 'excludedEdgep' is non-nullptr it must connect fromp and top.
+    //
+    // TODO: consider changing this API to the 'isTransitiveEdge' API
+    // used by GraphPathChecker
+public:
+    static bool pathExistsFrom(LogicMTask* fromp, LogicMTask* top,
+                               const V3GraphEdge* excludedEdgep) {
+        return pathExistsFromInternal(fromp, top, excludedEdgep, incGeneration());
+    }
+
+    static void dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment);
+
+private:
+    VL_UNCOPYABLE(LogicMTask);
+};
+
+//######################################################################
+// MTask utility classes
+
+// Sort AbstractMTask objects into deterministic order by calling id()
+// which is a unique and stable serial number.
+struct MTaskIdLessThan final {
+    bool operator()(const AbstractMTask* lhsp, const AbstractMTask* rhsp) const {
+        return lhsp->id() < rhsp->id();
+    }
+};
+
+struct MergeCandidateKey final {
+    // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node
+    uint64_t m_id;  // Unique ID part of edge score
+    uint32_t m_score;  // Score part of ID
+    bool operator<(const MergeCandidateKey& other) const {
+        // First by Score then by ID, but notice that we want minimums using a max-heap, so
+        // reverse
+        return m_score > other.m_score || (m_score == other.m_score && m_id > other.m_id);
+    }
+};
+
+using MergeCandidateScoreboard = V3Scoreboard<MergeCandidate, MergeCandidateKey>;
+
+// Information associated with scoreboarding a merge candidate
+class MergeCandidate VL_NOT_FINAL : public MergeCandidateScoreboard::Node {
+    // Only the known subclasses can create or delete one of these
+    friend class SiblingMC;
+    friend class MTaskEdge;
+
+    // This structure is extremely hot. To save 8 bytes we pack
+    // one bit indicating removedFromSb with the id. To save another
+    // 8 bytes by not having a virtual function table, we implement the
+    // few polymorphic methods over the two known subclasses explicitly,
+    // using another bit of the id to denote the actual subtype.
+
+    // By using the bottom bits for flags, we can still use < to compare IDs without masking.
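+    // For example, successive candidates get ids 2, 4, 6, ... (ID_INCREMENT is 2); a
+    // SiblingMC additionally ORs in bit 0, so ordering by the raw id still follows
+    // creation order while the low bit encodes the subtype.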
+    // <63:1> Serial number for ordering, <0> subtype (SiblingMC)
+    static constexpr uint64_t IS_SIBLING_MASK = 1ULL << 0;
+    static constexpr uint64_t ID_INCREMENT = 1ULL << 1;
+
+    bool isSiblingMC() const { return m_key.m_id & IS_SIBLING_MASK; }
+
+    // CONSTRUCTORS
+    explicit MergeCandidate(bool isSiblingMC) {
+        static uint64_t serial = 0;
+        serial += ID_INCREMENT;  // +ID_INCREMENT so doesn't set the special bottom bits
+        m_key.m_id = serial | (isSiblingMC * IS_SIBLING_MASK);
+    }
+    ~MergeCandidate() = default;
+
+public:
+    // METHODS
+    SiblingMC* toSiblingMC();  // Instead of cast<>/as<>
+    const SiblingMC* toSiblingMC() const;  // Instead of cast<>/as<>
+    MTaskEdge* toMTaskEdge();  // Instead of cast<>/as<>
+    const MTaskEdge* toMTaskEdge() const;  // Instead of cast<>/as<>
+    bool mergeWouldCreateCycle() const;  // Instead of virtual method
+
+    inline void rescore();
+    uint32_t score() const { return m_key.m_score; }
+
+    static MergeCandidate* heapNodeToElem(MergeCandidateScoreboard::Node* nodep) {
+        return static_cast<MergeCandidate*>(nodep);
+    }
+};
+
+static_assert(sizeof(MergeCandidate) == sizeof(MergeCandidateScoreboard::Node),
+              "Should not have a vtable");
+
+// A pair of associated LogicMTask's that are merge candidates for sibling
+// contraction
+class SiblingMC final : public MergeCandidate {
+    LogicMTask* const m_ap;
+    LogicMTask* const m_bp;
+
+    V3ListEnt<SiblingMC*> m_aEnt;  // List entry for m_ap->aSiblingMCs()
+    V3ListEnt<SiblingMC*> m_bEnt;  // List entry for m_bp->bSiblingMCs()
+
+public:
+    // CONSTRUCTORS
+    SiblingMC() = delete;
+    SiblingMC(LogicMTask* ap, LogicMTask* bp)
+        : MergeCandidate{/* isSiblingMC: */ true}
+        , m_ap{ap}
+        , m_bp{bp} {
+        // Storage management depends on this
+        UASSERT(ap->id() > bp->id(), "Should be ordered");
+        UDEBUGONLY(UASSERT(ap->siblings().count(bp), "Should be in sibling map"););
+        m_aEnt.pushBack(m_ap->aSiblingMCs(), this);
+        m_bEnt.pushBack(m_bp->bSiblingMCs(), this);
+    }
+    ~SiblingMC() = default;
+
+    // METHODS
+    SiblingMC* aNextp() const { return m_aEnt.nextp(); }
+    SiblingMC* bNextp() const { return m_bEnt.nextp(); }
+    void unlinkA() {
+        VL_ATTR_UNUSED const size_t removed = m_ap->siblings().erase(m_bp);
+        UDEBUGONLY(UASSERT(removed == 1, "Should have been in sibling set"););
+        m_aEnt.unlink(m_ap->aSiblingMCs(), this);
+    }
+    void unlinkB() { m_bEnt.unlink(m_bp->bSiblingMCs(), this); }
+
+    LogicMTask* ap() const { return m_ap; }
+    LogicMTask* bp() const { return m_bp; }
+    bool mergeWouldCreateCycle() const {
+        return (LogicMTask::pathExistsFrom(m_ap, m_bp, nullptr)
+                || LogicMTask::pathExistsFrom(m_bp, m_ap, nullptr));
+    }
+};
+
+static_assert(!std::is_polymorphic<SiblingMC>::value, "Should not have a vtable");
+
+// GraphEdge for the MTask graph
+class MTaskEdge final : public V3GraphEdge, public MergeCandidate {
+    VL_RTTI_IMPL(MTaskEdge, V3GraphEdge)
+    friend class LogicMTask;
+    template <GraphWay::en T_Way>
+    friend class PartPropagateCp;
+
+    // MEMBERS
+    // This edge can be in 2 EdgeHeaps, one forward and one reverse. We allocate the heap nodes
+    // directly within the edge as they are always required and this makes association cheap.
+// GraphEdge for the MTask graph
+class MTaskEdge final : public V3GraphEdge, public MergeCandidate {
+    VL_RTTI_IMPL(MTaskEdge, V3GraphEdge)
+    friend class LogicMTask;
+    template <GraphWay::en T_Way>
+    friend class PartPropagateCp;
+
+    // MEMBERS
+    // This edge can be in 2 EdgeHeaps, one forward and one reverse. We allocate the heap nodes
+    // directly within the edge as they are always required and this makes association cheap.
+    std::array<EdgeHeap::Node, GraphWay::NUM_WAYS> m_edgeHeapNode;
+
+public:
+    // CONSTRUCTORS
+    MTaskEdge(V3Graph* graphp, LogicMTask* fromp, LogicMTask* top, int weight)
+        : V3GraphEdge{graphp, fromp, top, weight}
+        , MergeCandidate{/* isSiblingMC: */ false} {
+        fromp->addRelativeMTask(top);
+        fromp->addRelativeEdge<GraphWay::FORWARD>(this);
+        top->addRelativeEdge<GraphWay::REVERSE>(this);
+    }
+    // METHODS
+    LogicMTask* furtherMTaskp(GraphWay way) const {
+        return static_cast<LogicMTask*>(this->furtherp(way));
+    }
+    LogicMTask* fromMTaskp() const { return static_cast<LogicMTask*>(fromp()); }
+    LogicMTask* toMTaskp() const { return static_cast<LogicMTask*>(top()); }
+    bool mergeWouldCreateCycle() const {
+        return LogicMTask::pathExistsFrom(fromMTaskp(), toMTaskp(), this);
+    }
+    // Following initial assignment of critical paths, clear this MTaskEdge
+    // out of the edge-map for each node and reinsert at a new location
+    // with updated critical path.
+    void resetCriticalPaths() {
+        LogicMTask* const fromp = fromMTaskp();
+        LogicMTask* const top = toMTaskp();
+        fromp->removeRelativeEdge<GraphWay::FORWARD>(this);
+        top->removeRelativeEdge<GraphWay::REVERSE>(this);
+        fromp->addRelativeEdge<GraphWay::FORWARD>(this);
+        top->addRelativeEdge<GraphWay::REVERSE>(this);
+    }
+
+    uint32_t cachedCp(GraphWay way) const { return m_edgeHeapNode[way].key().m_score; }
+
+    // Convert from the address of the m_edgeHeapNode[way] in an MTaskEdge back to the MTaskEdge
+    static const MTaskEdge* toMTaskEdge(GraphWay way, const EdgeHeap::Node* nodep) {
+        const size_t offset = VL_OFFSETOF(MTaskEdge, m_edgeHeapNode[way]);
+        return reinterpret_cast<const MTaskEdge*>(reinterpret_cast<uintptr_t>(nodep) - offset);
+    }
+
+private:
+    VL_UNCOPYABLE(MTaskEdge);
+};
+
+template <GraphWay::en T_Way>
+void LogicMTask::addRelativeEdge(MTaskEdge* edgep) {
+    constexpr GraphWay way{T_Way};
+    constexpr GraphWay inv = way.invert();
+    // Add to the edge heap
+    LogicMTask* const relativep = edgep->furtherMTaskp(way);
+    // Value is !way cp to this edge
+    const uint32_t cp = relativep->stepCost() + relativep->critPathCost(inv);
+    m_edgeHeap[way].insert(&edgep->m_edgeHeapNode[way], {relativep->id(), cp});
+}
+
+template <GraphWay::en T_Way>
+void LogicMTask::stealRelativeEdge(MTaskEdge* edgep) {
+    constexpr GraphWay way{T_Way};
+    // Make heap node insertable, ruining the heap it is currently in.
+    edgep->m_edgeHeapNode[way].yank();
+    // Add the edge as new
+    addRelativeEdge<T_Way>(edgep);
+}
+
+template <GraphWay::en T_Way>
+void LogicMTask::removeRelativeEdge(MTaskEdge* edgep) {
+    constexpr GraphWay way{T_Way};
+    // Remove from the edge heap
+    m_edgeHeap[way].remove(&edgep->m_edgeHeapNode[way]);
+}
+
+void LogicMTask::checkRelativesCp(GraphWay way) const {
+    for (V3GraphEdge* edgep = beginp(way); edgep; edgep = edgep->nextp(way)) {
+        const LogicMTask* const relativep = static_cast<LogicMTask*>(edgep->furtherp(way));
+        const uint32_t cachedCp = static_cast<const MTaskEdge*>(edgep)->cachedCp(way);
+        const uint32_t cp = relativep->critPathCost(way.invert()) + relativep->stepCost();
+        partCheckCachedScoreVsActual(cachedCp, cp);
+    }
+}
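toMTaskEdge() above recovers the owning edge from a pointer to one of its embedded heap nodes, a "container_of"-style computation. A sketch under simplified types; Edge and edgeFromHeapNode are illustrative, with standard offsetof standing in for VL_OFFSETOF:

    #include <cstddef>
    #include <cstdint>

    struct Edge {
        int payload = 0;
        int heapNode[2];  // Imagine intrusive heap nodes, one per graph direction
    };

    // Subtract the member's offset from the member's address to get the owner.
    // Computed as base-of-array offset plus element offset so the index may be
    // a runtime value (offsetof itself requires a constant member designator).
    static const Edge* edgeFromHeapNode(int way, const int* nodep) {
        const std::size_t offset = offsetof(Edge, heapNode) + way * sizeof(int);
        return reinterpret_cast<const Edge*>(reinterpret_cast<uintptr_t>(nodep) - offset);
    }

    // Usage:
    //   Edge e;
    //   assert(edgeFromHeapNode(1, &e.heapNode[1]) == &e);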
+uint32_t LogicMTask::critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const {
+    // Compute the critical path cost wayward to this node, without considering edge 'withoutp'.
+    // We need to look at two edges at most: the critical path, if that is not via 'withoutp',
+    // or the second-worst path, if the critical path is via 'withoutp'.
+#if VL_DEBUG
+    UASSERT(withoutp->furtherp(way) == this,
+            "In critPathCostWithout(), edge 'withoutp' must further to 'this'");
+#endif
+    const GraphWay inv = way.invert();
+    const EdgeHeap& edgeHeap = m_edgeHeap[inv];
+    const EdgeHeap::Node* const maxp = edgeHeap.max();
+    if (!maxp) return 0;
+    if (MTaskEdge::toMTaskEdge(inv, maxp) != withoutp) return maxp->key().m_score;
+    const EdgeHeap::Node* const secp = edgeHeap.secondMax();
+    if (!secp) return 0;
+    return secp->key().m_score;
+}
+
+void LogicMTask::dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment) {
+    const string filename = v3Global.debugFilename(nameComment) + ".txt";
+    UINFO(1, "Writing " << filename << endl);
+    const std::unique_ptr<std::ofstream> ofp{V3File::new_ofstream(filename)};
+    std::ostream* const osp = &(*ofp);  // &* needed to deref unique_ptr
+    if (osp->fail()) v3fatalStatic("Can't write " << filename);
+
+    // Find start vertex with longest CP
+    LogicMTask* startp = nullptr;
+    for (V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
+        LogicMTask* const mtaskp = static_cast<LogicMTask*>(vxp);
+        if (!startp) {
+            startp = mtaskp;
+            continue;
+        }
+        if (mtaskp->cost() + mtaskp->critPathCost(GraphWay::REVERSE)
+            > startp->cost() + startp->critPathCost(GraphWay::REVERSE)) {
+            startp = mtaskp;
+        }
+    }
+
+    // Follow the entire critical path
+    std::vector<const LogicMTask*> path;
+    uint32_t totalCost = 0;
+    for (LogicMTask* nextp = startp; nextp;) {
+        path.push_back(nextp);
+        totalCost += nextp->cost();
+
+        if (EdgeHeap::Node* const maxp = nextp->m_edgeHeap[GraphWay::FORWARD].max()) {
+            nextp = MTaskEdge::toMTaskEdge(GraphWay::FORWARD, maxp)->toMTaskp();
+        } else {
+            nextp = nullptr;
+        }
+    }
+
+    *osp << "totalCost = " << totalCost
+         << " (should match the computed critical path cost (CP) for the graph)\n";
+
+    // Dump
+    for (const LogicMTask* mtaskp : path) {
+        *osp << "begin mtask with cost " << mtaskp->cost() << '\n';
+        for (VxList::const_iterator lit = mtaskp->vertexListp()->begin();
+             lit != mtaskp->vertexListp()->end(); ++lit) {
+            const OrderLogicVertex* const logicp = (*lit)->logicp();
+            if (!logicp) continue;
+            if (false) {
+                // Show nodes only
+                *osp << "> ";
+                logicp->nodep()->dumpTree(*osp);
+            } else {
+                // Show nodes with hierarchical costs
+                V3InstrCount::count(logicp->nodep(), false, osp);
+            }
+        }
+    }
+}
+// Instead of dynamic cast
+SiblingMC* MergeCandidate::toSiblingMC() {
+    return isSiblingMC() ? static_cast<SiblingMC*>(this) : nullptr;
+}
+
+MTaskEdge* MergeCandidate::toMTaskEdge() {
+    return isSiblingMC() ? nullptr : static_cast<MTaskEdge*>(this);
+}
+
+const SiblingMC* MergeCandidate::toSiblingMC() const {
+    return isSiblingMC() ? static_cast<const SiblingMC*>(this) : nullptr;
+}
+
+const MTaskEdge* MergeCandidate::toMTaskEdge() const {
+    return isSiblingMC() ? nullptr : static_cast<const MTaskEdge*>(this);
+}
+
+// Normally this would be a virtual function, but we save space by not having a vtable,
+// and we know we only have 2 possible subclasses.
+bool MergeCandidate::mergeWouldCreateCycle() const {
+    return isSiblingMC() ? static_cast<const SiblingMC*>(this)->mergeWouldCreateCycle()
+                         : static_cast<const MTaskEdge*>(this)->mergeWouldCreateCycle();
+}
+
+static uint32_t siblingScore(const SiblingMC* sibsp) {
+    const LogicMTask* const ap = sibsp->ap();
+    const LogicMTask* const bp = sibsp->bp();
+    const uint32_t mergedCpCostFwd
+        = std::max(ap->critPathCost(GraphWay::FORWARD), bp->critPathCost(GraphWay::FORWARD));
+    const uint32_t mergedCpCostRev
+        = std::max(ap->critPathCost(GraphWay::REVERSE), bp->critPathCost(GraphWay::REVERSE));
+    return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(ap->cost() + bp->cost());
+}
+
+static uint32_t edgeScore(const MTaskEdge* edgep) {
+    // Score this edge. Lower is better. The score is the new local CP
+    // length if we merge these mtasks. ("Local" means the longest
+    // critical path running through the merged node.)
+    const LogicMTask* const top = static_cast<const LogicMTask*>(edgep->top());
+    const LogicMTask* const fromp = static_cast<const LogicMTask*>(edgep->fromp());
+    const uint32_t mergedCpCostFwd = std::max(fromp->critPathCost(GraphWay::FORWARD),
+                                              top->critPathCostWithout(GraphWay::FORWARD, edgep));
+    const uint32_t mergedCpCostRev = std::max(fromp->critPathCostWithout(GraphWay::REVERSE, edgep),
+                                              top->critPathCost(GraphWay::REVERSE));
+    return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(fromp->cost() + top->cost());
+}
+
+void MergeCandidate::rescore() {
+    if (const SiblingMC* const sibp = toSiblingMC()) {
+        m_key.m_score = siblingScore(sibp);
+    } else {
+        // The '1 +' favors merging a SiblingMC over an otherwise-
+        // equal-scoring MTaskEdge. The comment on selfTest() talks
+        // about why.
+        m_key.m_score = 1 + edgeScore(static_cast<const MTaskEdge*>(this));
+    }
+}
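For a concrete feel of the scoring arithmetic, here is the sibling formula on invented numbers, with stepCost() simplified to the identity (the real stepCost() quantizes costs):

    #include <algorithm>
    #include <cstdint>

    // Hypothetical stand-in: the real LogicMTask::stepCost() is not the identity.
    static uint32_t stepCost(uint32_t cost) { return cost; }

    static uint32_t siblingScoreSketch(uint32_t aFwd, uint32_t aRev, uint32_t aCost,
                                       uint32_t bFwd, uint32_t bRev, uint32_t bCost) {
        const uint32_t mergedFwd = std::max(aFwd, bFwd);  // New forward local CP
        const uint32_t mergedRev = std::max(aRev, bRev);  // New reverse local CP
        return mergedRev + mergedFwd + stepCost(aCost + bCost);
    }

    // Example: A has fwd CP 10, rev CP 25, cost 4; B has fwd CP 30, rev CP 5, cost 6.
    //   score = max(10, 30) + max(25, 5) + stepCost(4 + 6) = 30 + 25 + 10 = 65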
+//######################################################################
+
+// Look at vertex costs (in one way) to form critical paths for each
+// vertex.
+static void partInitHalfCriticalPaths(GraphWay way, V3Graph* mtasksp, bool checkOnly) {
+    GraphStreamUnordered order(mtasksp, way);
+    const GraphWay rev = way.invert();
+    for (const V3GraphVertex* vertexp; (vertexp = order.nextp());) {
+        const LogicMTask* const mtaskcp = static_cast<const LogicMTask*>(vertexp);
+        LogicMTask* const mtaskp = const_cast<LogicMTask*>(mtaskcp);
+        uint32_t cpCost = 0;
+#if VL_DEBUG
+        std::unordered_set<const V3GraphVertex*> relatives;
+#endif
+        for (V3GraphEdge* edgep = vertexp->beginp(rev); edgep; edgep = edgep->nextp(rev)) {
+#if VL_DEBUG
+            // Run a few asserts on the initial mtask graph,
+            // while we're iterating through...
+            UASSERT_OBJ(edgep->weight() != 0, mtaskp, "Should be no cut edges in mtasks graph");
+            UASSERT_OBJ(relatives.find(edgep->furtherp(rev)) == relatives.end(), mtaskp,
+                        "Should be no redundant edges in mtasks graph");
+            relatives.insert(edgep->furtherp(rev));
+#endif
+            const LogicMTask* const relativep = static_cast<LogicMTask*>(edgep->furtherp(rev));
+            cpCost = std::max(cpCost, (relativep->critPathCost(way)
+                                       + static_cast<uint32_t>(relativep->stepCost())));
+        }
+        if (checkOnly) {
+            partCheckCachedScoreVsActual(mtaskp->critPathCost(way), cpCost);
+        } else {
+            mtaskp->setCritPathCost(way, cpCost);
+        }
+    }
+}
+
+// Look at vertex costs to form critical paths for each vertex.
+static void partInitCriticalPaths(V3Graph* mtasksp) {
+    partInitHalfCriticalPaths(GraphWay::FORWARD, mtasksp, false);
+    partInitHalfCriticalPaths(GraphWay::REVERSE, mtasksp, false);
+
+    // Reset all MTaskEdges so that 'm_edges' will show correct CP numbers.
+    // They would have been all zeroes on initial creation of the MTaskEdges.
+    for (V3GraphVertex* vxp = mtasksp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
+        for (V3GraphEdge* edgep = vxp->outBeginp(); edgep; edgep = edgep->outNextp()) {
+            MTaskEdge* const mtedgep = edgep->as<MTaskEdge>();
+            mtedgep->resetCriticalPaths();
+        }
+    }
+}
+
+// Do an EXPENSIVE check to make sure that all incremental CP updates have
+// gone correctly.
+static void partCheckCriticalPaths(V3Graph* mtasksp) {
+    partInitHalfCriticalPaths(GraphWay::FORWARD, mtasksp, true);
+    partInitHalfCriticalPaths(GraphWay::REVERSE, mtasksp, true);
+    for (V3GraphVertex* vxp = mtasksp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
+        const LogicMTask* const mtaskp = static_cast<const LogicMTask*>(vxp);
+        mtaskp->checkRelativesCp(GraphWay::FORWARD);
+        mtaskp->checkRelativesCp(GraphWay::REVERSE);
+    }
+}
+
+//######################################################################
+// PartPropagateCp
+
+// Propagate increasing critical path (CP) costs through a graph.
+//
+// Usage:
+//  * Client increases the cost and/or CP at a node or small set of nodes
+//    (often a pair in practice, eg. edge contraction.)
+//  * Client calls PartPropagateCp::cpHasIncreased() one or more times.
+//    Each call indicates that the inclusive CP of some "seed" vertex
+//    has increased to a given value.
+//    * NOTE: PartPropagateCp will neither read nor modify the cost
+//      or CPs at the seed vertices, it only accesses and modifies
+//      vertices wayward from the seeds.
+//  * Client calls PartPropagateCp::go(). Internally, this iteratively
+//    propagates the new CPs wayward through the graph.
+//
+template <GraphWay::en T_Way>
+class PartPropagateCp final {
+    // TYPES
+
+    // We keep pending vertices in a heap during critical path propagation
+    struct PendingKey final {
+        LogicMTask* m_mtaskp;  // The vertex in the heap
+        uint32_t m_score;  // The score of this entry
+        void increase(uint32_t score) {
+#if VL_DEBUG
+            UASSERT(score >= m_score, "Must increase");
+#endif
+            m_score = score;
+        }
+        bool operator<(const PendingKey& other) const {
+            if (m_score != other.m_score) return m_score < other.m_score;
+            return LogicMTask::CmpLogicMTask{}(m_mtaskp, other.m_mtaskp);
+        }
+    };
+
+    using PendingHeap = PairingHeap<PendingKey>;
+    using PendingHeapNode = typename PendingHeap::Node;
+
+    // MEMBERS
+    PendingHeap m_pendingHeap;  // Heap of pending rescores
+
+    // We allocate this many heap nodes at once
+    static constexpr size_t ALLOC_CHUNK_SIZE = 128;
+    PendingHeapNode* m_freep = nullptr;  // List of free heap nodes
+    std::vector<std::unique_ptr<PendingHeapNode[]>> m_allocated;  // Allocated heap nodes
+
+    const bool m_slowAsserts;  // Enable nontrivial asserts
+    std::set<const LogicMTask*> m_seen;  // Used only with slow asserts to check mtasks visited only once
+
+public:
+    // CONSTRUCTORS
+    explicit PartPropagateCp(bool slowAsserts)
+        : m_slowAsserts{slowAsserts} {}
+
+    // METHODS
+private:
+    // Allocate a HeapNode for the given element
+    PendingHeapNode* allocNode() {
+        // If no free nodes available, then make some
+        if (!m_freep) {
+            // Allocate in chunks for efficiency
+            m_allocated.emplace_back(new PendingHeapNode[ALLOC_CHUNK_SIZE]);
+            // Set up free list pointer
+            m_freep = m_allocated.back().get();
+            // Set up free list chain
+            for (size_t i = 1; i < ALLOC_CHUNK_SIZE; ++i) {
+                m_freep[i - 1].m_next.m_ptr = &m_freep[i];
+            }
+            // Clear the next pointer of the last entry
+            m_freep[ALLOC_CHUNK_SIZE - 1].m_next.m_ptr = nullptr;
+        }
+        // Free nodes are available, pick up the first one
+        PendingHeapNode* const resultp = m_freep;
+        m_freep = resultp->m_next.m_ptr;
+        resultp->m_next.m_ptr = nullptr;
+        return resultp;
+    }
+
+    // Release a heap node (make it available for future allocation)
+    void freeNode(PendingHeapNode* nodep) {
+        // Re-use the existing link pointers and simply prepend it to the free list
+        nodep->m_next.m_ptr = m_freep;
+        m_freep = nodep;
+    }
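The allocNode()/freeNode() pair above is a chunked free-list: nodes are carved from fixed-size arrays and recycled through an intrusive singly linked list, so steady-state allocation is a pointer pop. The same shape in isolation; NodeArena, Node, and CHUNK are illustrative, not the real types:

    #include <cstddef>
    #include <memory>
    #include <vector>

    struct Node {
        Node* nextp = nullptr;  // Intrusive link, as PairingHeap nodes carry
    };

    class NodeArena {
        static constexpr std::size_t CHUNK = 128;
        Node* m_freep = nullptr;
        std::vector<std::unique_ptr<Node[]>> m_chunks;

    public:
        Node* alloc() {
            if (!m_freep) {  // Refill: carve a new chunk into a linked free list
                m_chunks.emplace_back(new Node[CHUNK]);
                Node* const basep = m_chunks.back().get();
                for (std::size_t i = 1; i < CHUNK; ++i) basep[i - 1].nextp = &basep[i];
                basep[CHUNK - 1].nextp = nullptr;
                m_freep = basep;
            }
            Node* const resultp = m_freep;
            m_freep = resultp->nextp;
            resultp->nextp = nullptr;
            return resultp;
        }
        void free(Node* nodep) {  // O(1): prepend to the free list
            nodep->nextp = m_freep;
            m_freep = nodep;
        }
    };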
+public:
+    void cpHasIncreased(V3GraphVertex* vxp, uint32_t newInclusiveCp) {
+        constexpr GraphWay way{T_Way};
+        constexpr GraphWay inv{way.invert()};
+
+        // For *vxp, whose CP-inclusive has just increased to
+        // newInclusiveCp, iterate to all wayward nodes, update the edges
+        // of each, and add each to m_pending if its overall CP has grown.
+        for (MTaskEdge *edgep = static_cast<MTaskEdge*>(vxp->beginp(way)), *nextp; edgep;
+             edgep = nextp) {
+            // Fetch early as likely cache miss
+            nextp = static_cast<MTaskEdge*>(edgep->nextp(way));
+
+            LogicMTask* const relativep = edgep->furtherMTaskp(way);
+            EdgeHeap::Node& edgeHeapNode = edgep->m_edgeHeapNode[inv];
+            if (newInclusiveCp > edgeHeapNode.key().m_score) {
+                relativep->m_edgeHeap[inv].increaseKey(&edgeHeapNode, newInclusiveCp);
+            }
+
+            const uint32_t critPathCost = relativep->critPathCost(way);
+
+            if (critPathCost >= newInclusiveCp) continue;
+
+            // relativep's critPathCost() is out of step with its longest !wayward edge.
+            // Schedule that to be resolved.
+            const uint32_t newVal = newInclusiveCp - critPathCost;
+
+            if (PendingHeapNode* const nodep = static_cast<PendingHeapNode*>(relativep->userp())) {
+                // Already in heap. Increase score if needed.
+                if (newVal > nodep->key().m_score) m_pendingHeap.increaseKey(nodep, newVal);
+                continue;
+            }
+
+            // Add to heap
+            PendingHeapNode* const nodep = allocNode();
+            relativep->userp(nodep);
+            m_pendingHeap.insert(nodep, {relativep, newVal});
+        }
+    }
+
+    void go() {
+        constexpr GraphWay way{T_Way};
+        constexpr GraphWay inv{way.invert()};
+
+        // m_pending maps each pending vertex to the amount that its wayward
+        // CP will grow.
+        //
+        // We can iterate over the pending set in reverse order, always
+        // choosing the nodes with the largest pending CP-growth.
+        //
+        // The intuition is: if the original seed node had its CP grow by
+        // 50, the most any wayward node can possibly grow is also 50. So
+        // for anything pending to grow by 50, we know we can process it
+        // once and we won't have to grow its CP again on the current pass.
+        // After we're done with all the grow-by-50s, nothing else will
+        // grow by 50 again on the current pass, and we can process the
+        // grow-by-49s and we know we'll only have to process each one
+        // once. And so on.
+        //
+        // This generalizes to multiple seed nodes also.
+        while (!m_pendingHeap.empty()) {
+            // Pop max element from heap
+            PendingHeapNode* const maxp = m_pendingHeap.max();
+            m_pendingHeap.remove(maxp);
+            // Pick up values
+            LogicMTask* const mtaskp = maxp->key().m_mtaskp;
+            const uint32_t cpGrowBy = maxp->key().m_score;
+            // Free the heap node, we are done with it
+            freeNode(maxp);
+            mtaskp->userp(nullptr);
+            // Update the critPathCost of mtaskp, which was out-of-date with respect to its edges
+            const uint32_t startCp = mtaskp->critPathCost(way);
+            const uint32_t newCp = startCp + cpGrowBy;
+            if (VL_UNLIKELY(m_slowAsserts)) {
+                // Check that CP matches that of the longest edge wayward of vxp.
+                const uint32_t edgeCp = mtaskp->m_edgeHeap[inv].max()->key().m_score;
+                UASSERT_OBJ(edgeCp == newCp, mtaskp, "CP doesn't match longest wayward edge");
+                // Confirm that we only set each node's CP once. That's an
+                // important property of PartPropagateCp which allows it to be far
+                // faster than a recursive algorithm on some graphs.
+                const bool first = m_seen.insert(mtaskp).second;
+                UASSERT_OBJ(first, mtaskp, "Set CP on node twice");
+            }
+            mtaskp->setCritPathCost(way, newCp);
+            cpHasIncreased(mtaskp, newCp + mtaskp->stepCost());
+        }
+
+        if (VL_UNLIKELY(m_slowAsserts)) m_seen.clear();
+    }
+
+private:
+    VL_UNCOPYABLE(PartPropagateCp);
+};
+class PartPropagateCpSelfTest final {
+    // MEMBERS
+    V3Graph m_graph;  // A graph
+    std::array<LogicMTask*, 50> m_vx;  // All vertices within the graph
+
+    // CONSTRUCTORS
+    PartPropagateCpSelfTest() = default;
+    ~PartPropagateCpSelfTest() = default;
+
+    void go() {
+        // Generate a pseudo-random graph
+        std::array<uint64_t, 2> rngState
+            = {{0x12345678ULL, 0x9abcdef0ULL}};  // GCC 3.8.0 wants {{}}
+        // Create 50 vertices
+        for (auto& i : m_vx) {
+            i = new LogicMTask{&m_graph, nullptr};
+            i->setCost(1);
+        }
+        // Create 250 edges at random. Edges must go from
+        // lower-to-higher index vertices, so we get a DAG.
+        for (unsigned i = 0; i < 250; ++i) {
+            const unsigned idx1 = V3Os::rand64(rngState) % 50;
+            const unsigned idx2 = V3Os::rand64(rngState) % 50;
+            if (idx1 > idx2) {
+                if (!m_vx[idx2]->hasRelativeMTask(m_vx[idx1])) {
+                    new MTaskEdge{&m_graph, m_vx[idx2], m_vx[idx1], 1};
+                }
+            } else if (idx2 > idx1) {
+                if (!m_vx[idx1]->hasRelativeMTask(m_vx[idx2])) {
+                    new MTaskEdge{&m_graph, m_vx[idx1], m_vx[idx2], 1};
+                }
+            }
+        }
+
+        partInitCriticalPaths(&m_graph);
+
+        PartPropagateCp<GraphWay::FORWARD> prop{true};
+
+        // Seed the propagator with every input node;
+        // this should result in the complete graph getting all CP's assigned.
+        for (const auto& i : m_vx) {
+            if (!i->inBeginp()) prop.cpHasIncreased(i, 1 /* inclusive CP starts at 1 */);
+        }
+
+        // Run the propagator.
+        prop.go();
+
+        // Finally, confirm that the entire graph appears to have correct CPs.
+        partCheckCriticalPaths(&m_graph);
+    }
+
+public:
+    static void selfTest() { PartPropagateCpSelfTest{}.go(); }
+};
+
+// Merge edges from a LogicMTask.
+//
+// This code removes adjacent edges. When this occurs, mark it in need
+// of a rescore, in case its score has fallen and we need to move it up
+// toward the front of the scoreboard.
+//
+// Wait, what? Shouldn't the scores only increase as we merge nodes? Well
+// that's almost true. But there is one exception.
+//
+// Suppose we have A->B, B->C, and A->C.
+//
+// The A->C edge is a "transitive" edge. It's ineligible to be merged, as
+// the merge would create a cycle. We score it on the scoreboard like any
+// other edge.
+//
+// However, our "score" estimate for A->C is bogus, because the forward
+// critical path to C and the reverse critical path to A both contain the
+// same node (B) so we overestimate the score of A->C. At first this
+// doesn't matter, since transitive edges aren't eligible to merge anyway.
+//
+// Later, suppose the edge contractor decides to merge the B->C edge, with
+// B donating all its incoming edges into C, say. (So we reach this
+// function.)
+//
+// With B going away, the A->C edge will no longer be transitive and it
+// will become eligible to merge. But if we don't mark it for rescore,
+// it'll stay in the scoreboard with its old (overestimate) score. We'll
+// merge it too late due to the bogus score. When we finally merge it, we
+// fail the assert in the main edge contraction loop which checks that the
+// actual score did not fall below the scoreboard's score.
+//
+// Another way of stating this: this code ensures that scores of
+// non-transitive edges only ever increase.
+static void partRedirectEdgesFrom(V3Graph* graphp, LogicMTask* recipientp, LogicMTask* donorp,
+                                  MergeCandidateScoreboard* sbp) {
+
+    // Process outgoing edges
+    MTaskEdge* outNextp = static_cast<MTaskEdge*>(donorp->outBeginp());
+    while (outNextp) {
+        MTaskEdge* const edgep = outNextp;
+        LogicMTask* const relativep = outNextp->toMTaskp();
+        outNextp = static_cast<MTaskEdge*>(outNextp->outNextp());
+
+        relativep->removeRelativeEdge<GraphWay::REVERSE>(edgep);
+
+        if (recipientp->hasRelativeMTask(relativep)) {
+            // An edge already exists between recipient and relative of donor.
+            // Mark it in need of a rescore
+            if (sbp) {
+                if (sbp->contains(edgep)) sbp->remove(edgep);
+                MTaskEdge* const existMTaskEdgep = static_cast<MTaskEdge*>(
+                    recipientp->findConnectingEdgep(GraphWay::FORWARD, relativep));
+#if VL_DEBUG
+                UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge");
+#endif
+                if (sbp->contains(existMTaskEdgep)) sbp->hintScoreChanged(existMTaskEdgep);
+            }
+            VL_DO_DANGLING(edgep->unlinkDelete(), edgep);
+        } else {
+            // No existing edge between recipient and relative of donor.
+            // Redirect the edge from donor<->relative to recipient<->relative.
+            edgep->relinkFromp(recipientp);
+            recipientp->addRelativeMTask(relativep);
+            recipientp->stealRelativeEdge<GraphWay::FORWARD>(edgep);
+            relativep->addRelativeEdge<GraphWay::REVERSE>(edgep);
+            if (sbp) {
+                if (!sbp->contains(edgep)) {
+                    sbp->add(edgep);
+                } else {
+                    sbp->hintScoreChanged(edgep);
+                }
+            }
+        }
+    }
+
+    // Process incoming edges
+    MTaskEdge* inNextp = static_cast<MTaskEdge*>(donorp->inBeginp());
+    while (inNextp) {
+        MTaskEdge* const edgep = inNextp;
+        LogicMTask* const relativep = inNextp->fromMTaskp();
+        inNextp = static_cast<MTaskEdge*>(inNextp->inNextp());
+
+        relativep->removeRelativeMTask(donorp);
+        relativep->removeRelativeEdge<GraphWay::FORWARD>(edgep);
+
+        if (relativep->hasRelativeMTask(recipientp)) {
+            // An edge already exists between recipient and relative of donor.
+            // Mark it in need of a rescore
+            if (sbp) {
+                if (sbp->contains(edgep)) sbp->remove(edgep);
+                MTaskEdge* const existMTaskEdgep = static_cast<MTaskEdge*>(
+                    recipientp->findConnectingEdgep(GraphWay::REVERSE, relativep));
+#if VL_DEBUG
+                UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge");
+#endif
+                if (sbp->contains(existMTaskEdgep)) sbp->hintScoreChanged(existMTaskEdgep);
+            }
+            VL_DO_DANGLING(edgep->unlinkDelete(), edgep);
+        } else {
+            // No existing edge between recipient and relative of donor.
+            // Redirect the edge from donor<->relative to recipient<->relative.
+            edgep->relinkTop(recipientp);
+            relativep->addRelativeMTask(recipientp);
+            relativep->addRelativeEdge<GraphWay::FORWARD>(edgep);
+            recipientp->stealRelativeEdge<GraphWay::REVERSE>(edgep);
+            if (sbp) {
+                if (!sbp->contains(edgep)) {
+                    sbp->add(edgep);
+                } else {
+                    sbp->hintScoreChanged(edgep);
+                }
+            }
+        }
+    }
+
+    // Remove donorp from the graph
+    VL_DO_DANGLING(donorp->unlinkDelete(graphp), donorp);
+}
+
+//######################################################################
+// PartContraction
+
+// Perform edge or sibling contraction on the partition graph
+class PartContraction final {
+    // TYPES
+    // New CP information for mtaskp reflecting an upcoming merge
+    struct NewCp final {
+        uint32_t cp;
+        uint32_t propagateCp;
+        bool propagate;
+    };
+
+    // MEMBERS
+    V3Graph* const m_mtasksp;  // Mtask graph
+    uint32_t m_scoreLimit;  // Sloppy score allowed when picking merges
+    uint32_t m_scoreLimitBeforeRescore = 0xffffffff;  // Next score rescore at
+    unsigned m_mergesSinceRescore = 0;  // Merges since last rescore
+    const bool m_slowAsserts;  // Take extra time to validate algorithm
+    MergeCandidateScoreboard m_sb;  // Scoreboard
+
+    PartPropagateCp<GraphWay::FORWARD> m_forwardPropagator{m_slowAsserts};  // Forward propagator
+    PartPropagateCp<GraphWay::REVERSE> m_reversePropagator{m_slowAsserts};  // Reverse propagator
+
+    LogicMTask* const m_entryMTaskp;  // Singular source vertex of the dependency graph
+    LogicMTask* const m_exitMTaskp;  // Singular sink vertex of the dependency graph
+
+public:
+    // CONSTRUCTORS
+    PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, LogicMTask* entryMTaskp,
+                    LogicMTask* exitMTaskp, bool slowAsserts)
+        : m_mtasksp{mtasksp}
+        , m_scoreLimit{scoreLimit}
+        , m_slowAsserts{slowAsserts}
+        , m_entryMTaskp{entryMTaskp}
+        , m_exitMTaskp{exitMTaskp} {}
+
+    // METHODS
+    void go() {
+        if (m_slowAsserts) {
+            // Check there are no redundant edges
+            for (V3GraphVertex* itp = m_mtasksp->verticesBeginp(); itp;
+                 itp = itp->verticesNextp()) {
+                std::unordered_set<const V3GraphVertex*> neighbors;
+                for (V3GraphEdge* edgep = itp->outBeginp(); edgep; edgep = edgep->outNextp()) {
+                    const bool first = neighbors.insert(edgep->top()).second;
+                    UASSERT_OBJ(first, itp, "Redundant edge found in input to PartContraction()");
+                }
+            }
+        }
+
+        unsigned maxMTasks = v3Global.opt.threadsMaxMTasks();
+        if (maxMTasks == 0) {  // Unspecified so estimate
+            if (v3Global.opt.threads() > 1) {
+                maxMTasks = (PART_DEFAULT_MAX_MTASKS_PER_THREAD * v3Global.opt.threads());
+            } else {
+                // Running PartContraction with --threads <= 1 means self-test
+                maxMTasks = 500;
+            }
+        }
+
+        // OPTIMIZATION PASS: Edge contraction and sibling contraction.
+        //  - Score each pair of mtasks which is a candidate to merge.
+        //    * Each edge defines such a candidate pair
+        //    * Two mtasks that are prereqs or postreqs of a common third
+        //      vertex are "siblings", these are also a candidate pair.
+        //  - Build a list of MergeCandidates, sorted by score.
+        //  - Merge the best pair.
+        //  - Incrementally recompute critical paths near the merged mtask.
+
+        for (V3GraphVertex* itp = m_mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) {
+            itp->userp(nullptr);  // Reset user value while we are here. Used by PartPropagateCp.
+            for (V3GraphEdge* edgep = itp->outBeginp(); edgep; edgep = edgep->outNextp()) {
+                m_sb.add(static_cast<MTaskEdge*>(edgep));
+            }
+            siblingPairFromRelatives<GraphWay::FORWARD, true>(itp);
+            siblingPairFromRelatives<GraphWay::REVERSE, true>(itp);
+        }
+
+        doRescore();  // Set initial scores in scoreboard
+
+        while (true) {
+            // This is the best edge to merge, with the lowest
+            // score (shortest local critical path)
+            MergeCandidate* const mergeCanp = m_sb.best();
+            if (!mergeCanp) {
+                // Scoreboard found no eligible merges. Maybe a rescore
+                // will produce some merge-able pairs?
+                if (m_sb.needsRescore()) {
+                    doRescore();
+                    continue;
+                }
+                break;
+            }
+
+            if (m_slowAsserts) {
+                UASSERT(!m_sb.needsRescore(mergeCanp),
+                        "Need-rescore items should not be returned by bestp");
+            }
+            const uint32_t cachedScore = mergeCanp->score();
+            mergeCanp->rescore();
+            const uint32_t actualScore = mergeCanp->score();
+
+            if (actualScore > cachedScore) {
+                // Cached score is out-of-date.
+                // Mark this elem as in need of a rescore and continue.
+                m_sb.hintScoreChanged(mergeCanp);
+                continue;
+            }
+            // ... we'll also confirm that actualScore hasn't shrunk relative
+            // to cached score, after the mergeWouldCreateCycle() check.
+
+            if (actualScore > m_scoreLimit) {
+                // Our best option isn't good enough
+                if (m_sb.needsRescore()) {
+                    // Some pairs need a rescore, maybe those will be
+                    // eligible to merge afterward.
+                    doRescore();
+                    continue;
+                } else {
+                    // We've exhausted everything below m_scoreLimit; stop.
+
+                    // Except, if we have too many mtasks, raise the score
+                    // limit and keep going...
+                    unsigned mtaskCount = 0;
+                    for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp(); vxp;
+                         vxp = vxp->verticesNextp()) {
+                        ++mtaskCount;
+                    }
+                    if (mtaskCount > maxMTasks) {
+                        const uint32_t oldLimit = m_scoreLimit;
+                        m_scoreLimit = (m_scoreLimit * 120) / 100;
+                        v3Global.rootp()->fileline()->v3warn(
+                            UNOPTTHREADS, "Thread scheduler is unable to provide requested "
+                                          "parallelism; suggest asking for fewer threads.");
+                        UINFO(1, "Critical path limit was=" << oldLimit << " now=" << m_scoreLimit
+                                                            << endl);
+                        continue;
+                    }
+                    // Really stop
+                    break;
+                }
+            }
+            if (actualScore > m_scoreLimitBeforeRescore) {
+                // Time to rescore, that will result in a higher
+                // scoreLimitBeforeRescore, and possibly lower-scoring
+                // elements returned from bestp().
+                doRescore();
+                continue;
+            }
+
+            // Avoid merging the entry/exit nodes. This would create serialization, by forcing the
+            // merged MTask to run before/after everything else. Empirically this helps
+            // performance in a modest way by allowing other MTasks to start earlier.
+            if (MTaskEdge* const edgep = mergeCanp->toMTaskEdge()) {
+                if (edgep->fromp() == m_entryMTaskp || edgep->top() == m_exitMTaskp) {
+                    m_sb.remove(mergeCanp);
+                    continue;
+                }
+            }
+
+            // Avoid merging any edge that would create a cycle.
+            //
+            // For example suppose we begin with vertices A, B, C and edges
+            // A->B, B->C, A->C.
+            //
+            // Suppose we want to merge A->C into a single vertex.
+            // New edges would be AC->B and B->AC which is not a DAG.
+            // Do not allow this.
+            if (mergeCanp->mergeWouldCreateCycle()) {
+                // Remove this candidate from scoreboard so we don't keep
+                // reconsidering it on every loop.
+                m_sb.remove(mergeCanp);
+                if (SiblingMC* const smcp = mergeCanp->toSiblingMC()) {
+                    smcp->unlinkA();
+                    smcp->unlinkB();
+                    delete smcp;
+                }
+                continue;
+            }
+
+            partCheckCachedScoreVsActual(cachedScore, actualScore);
+
+            // Finally there's no cycle risk, no need to rescore, we're
+            // within m_scoreLimit and m_scoreLimitBeforeRescore.
+            // This is the edge to merge.
+            //
+            // Bookkeeping: if this is the first edge we'll merge since
+            // the last rescore, compute the new m_scoreLimitBeforeRescore
+            // to be somewhat higher than this edge's score.
+            if (m_mergesSinceRescore == 0) {
+#if PART_STEPPED_RESCORELIMIT
+                m_scoreLimitBeforeRescore = (actualScore * 105) / 100;
+#else
+                m_scoreLimitBeforeRescore = actualScore;
+#endif
+
+                // This print can serve as a progress indicator, as it
+                // increases from low numbers up toward cpLimit. It may be
+                // helpful to see progress during slow partitions. Maybe
+                // display something by default even?
+                UINFO(6, "New scoreLimitBeforeRescore: " << m_scoreLimitBeforeRescore << endl);
+            }
+
+            // Finally merge this candidate.
+            contract(mergeCanp);
+        }
+    }
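The loop above implements a lazy-rescore selection protocol: cached scores may be stale, so the best candidate is re-scored on selection and retried if its score rose. A toy, runnable model of just that protocol; Candidate and contractBest are invented names, and the linear scan stands in for the real heap-based V3Scoreboard:

    #include <cstdint>
    #include <vector>

    struct Candidate {
        uint32_t cached;  // Score the selection structure believes
        uint32_t actual;  // Score a fresh rescore would compute
        bool merged = false;
    };

    inline void contractBest(std::vector<Candidate>& cands, uint32_t scoreLimit) {
        while (true) {
            Candidate* bestp = nullptr;
            for (Candidate& c : cands)  // Stand-in for sb.best(): lowest cached score
                if (!c.merged && (!bestp || c.cached < bestp->cached)) bestp = &c;
            if (!bestp) break;
            if (bestp->actual > bestp->cached) {  // Cached score was stale: fix, retry
                bestp->cached = bestp->actual;
                continue;
            }
            if (bestp->actual > scoreLimit) break;  // Best option not good enough
            bestp->merged = true;  // Stand-in for contract(...)
        }
    }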
+private:
+    template <GraphWay::en T_Way>
+    NewCp newCp(LogicMTask* mtaskp, LogicMTask* otherp, MTaskEdge* mergeEdgep) {
+        constexpr GraphWay way{T_Way};
+        // Return new wayward-CP for mtaskp reflecting its upcoming merge
+        // with otherp. Set 'result.propagate' if mtaskp's wayward
+        // relatives will see a new wayward CP from this merge.
+        uint32_t newCp;
+        if (mergeEdgep) {
+            if (mtaskp == mergeEdgep->furtherp(way)) {
+                newCp = std::max(otherp->critPathCost(way),
+                                 mtaskp->critPathCostWithout(way, mergeEdgep));
+            } else {
+                newCp = std::max(mtaskp->critPathCost(way),
+                                 otherp->critPathCostWithout(way, mergeEdgep));
+            }
+        } else {
+            newCp = std::max(otherp->critPathCost(way), mtaskp->critPathCost(way));
+        }
+
+        const uint32_t origRelativesCp = mtaskp->critPathCost(way) + mtaskp->stepCost();
+        const uint32_t newRelativesCp
+            = newCp + LogicMTask::stepCost(mtaskp->cost() + otherp->cost());
+
+        NewCp result;
+        result.cp = newCp;
+        result.propagate = (newRelativesCp > origRelativesCp);
+        result.propagateCp = newRelativesCp;
+        return result;
+    }
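A worked instance of the newCp() arithmetic for the sibling (no merge edge) case, under the simplifying, hypothetical assumption that stepCost() is the identity; all numbers are invented:

    #include <algorithm>
    #include <cstdint>

    struct NewCpSketch {
        uint32_t cp;
        uint32_t propagateCp;
        bool propagate;
    };

    inline NewCpSketch newCpSketch(uint32_t myCp, uint32_t myCost, uint32_t otherCp,
                                   uint32_t otherCost) {
        const auto stepCost = [](uint32_t c) { return c; };  // Identity for readability
        const uint32_t newCp = std::max(myCp, otherCp);
        const uint32_t origRelativesCp = myCp + stepCost(myCost);
        const uint32_t newRelativesCp = newCp + stepCost(myCost + otherCost);
        return {newCp, newRelativesCp, newRelativesCp > origRelativesCp};
    }

    // Merging A (cp 10, cost 4) with sibling B (cp 30, cost 6), seen from A:
    //   newCp = max(10, 30) = 30; origRelativesCp = 10 + 4 = 14;
    //   newRelativesCp = 30 + 10 = 40; since 40 > 14, propagate to A's relatives.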
+    void removeSiblingMCsWith(LogicMTask* mtaskp) {
+        for (SiblingMC *smcp = mtaskp->aSiblingMCs().begin(), *nextp;  // lintok-begin-on-ref
+             smcp; smcp = nextp) {
+            nextp = smcp->aNextp();
+            m_sb.remove(smcp);
+            smcp->unlinkB();
+            delete smcp;
+        }
+        for (SiblingMC *smcp = mtaskp->bSiblingMCs().begin(), *nextp;  // lintok-begin-on-ref
+             smcp; smcp = nextp) {
+            nextp = smcp->bNextp();
+            m_sb.remove(smcp);
+            smcp->unlinkA();
+            delete smcp;
+        }
+    }
+
+    void removeSiblingMCs(LogicMTask* recipientp, LogicMTask* donorp) {
+        // The lists here should be disjoint (there should be only one SiblingMC involving these
+        // two MTasks, and we removed that elsewhere), so no need for unlinking from the lists we
+        // are clearing.
+        removeSiblingMCsWith(recipientp);
+        removeSiblingMCsWith(donorp);
+
+        // Clear the sibling map of the recipient. The donor will be deleted anyway, so we can
+        // leave it in a corrupt state for efficiency.
+        recipientp->siblings().clear();
+        recipientp->aSiblingMCs().reset();
+        recipientp->bSiblingMCs().reset();
+    }
+
+    void contract(MergeCandidate* mergeCanp) {
+        LogicMTask* top = nullptr;
+        LogicMTask* fromp = nullptr;
+        MTaskEdge* const mergeEdgep = mergeCanp->toMTaskEdge();
+        SiblingMC* const mergeSibsp = mergeCanp->toSiblingMC();
+        if (mergeEdgep) {
+            top = static_cast<LogicMTask*>(mergeEdgep->top());
+            fromp = static_cast<LogicMTask*>(mergeEdgep->fromp());
+        } else {
+            top = mergeSibsp->ap();
+            fromp = mergeSibsp->bp();
+        }
+
+        // Merge the smaller mtask into the larger mtask. If one of them
+        // is much larger, this will save time in partRedirectEdgesFrom().
+        // Assume the more costly mtask has more edges.
+        //
+        // [TODO: now that we have edge maps, we could count the edges
+        //  exactly without a linear search.]
+        LogicMTask* recipientp;
+        LogicMTask* donorp;
+        if (fromp->cost() > top->cost()) {
+            recipientp = fromp;
+            donorp = top;
+        } else {
+            donorp = fromp;
+            recipientp = top;
+        }
+        VL_DANGLING(fromp);
+        VL_DANGLING(top);  // Use donorp and recipientp now instead
+
+        // Recursively update forward and reverse CP numbers.
+        //
+        // Doing this before merging the mtasks lets us often avoid
+        // recursing through either incoming or outgoing edges on one or
+        // both mtasks.
+        //
+        // These 'NewCp' objects carry a bit indicating whether we must
+        // propagate CP for each of the four cases:
+        const NewCp recipientNewCpFwd = newCp<GraphWay::FORWARD>(recipientp, donorp, mergeEdgep);
+        const NewCp donorNewCpFwd = newCp<GraphWay::FORWARD>(donorp, recipientp, mergeEdgep);
+        const NewCp recipientNewCpRev = newCp<GraphWay::REVERSE>(recipientp, donorp, mergeEdgep);
+        const NewCp donorNewCpRev = newCp<GraphWay::REVERSE>(donorp, recipientp, mergeEdgep);
+
+        m_sb.remove(mergeCanp);
+
+        if (mergeEdgep) {
+            // Remove and free the connecting edge. Must do this before propagating CP's below.
+            mergeEdgep->fromMTaskp()->removeRelativeMTask(mergeEdgep->toMTaskp());
+            mergeEdgep->fromMTaskp()->removeRelativeEdge<GraphWay::FORWARD>(mergeEdgep);
+            mergeEdgep->toMTaskp()->removeRelativeEdge<GraphWay::REVERSE>(mergeEdgep);
+            VL_DO_DANGLING(mergeEdgep->unlinkDelete(), mergeEdgep);
+        } else {
+            // Remove the siblingMC
+            mergeSibsp->unlinkA();
+            mergeSibsp->unlinkB();
+            VL_DO_DANGLING(delete mergeSibsp, mergeSibsp);
+        }
+
+        // This also updates cost and stepCost on recipientp
+        recipientp->moveAllVerticesFrom(donorp);
+
+        UINFO(9, "recipient = " << recipientp->id() << ", donor = " << donorp->id()
+                                << ", mergeEdgep = " << mergeEdgep << "\n"
+                                << "recipientNewCpFwd = " << recipientNewCpFwd.cp
+                                << (recipientNewCpFwd.propagate ? " true " : " false ")
+                                << recipientNewCpFwd.propagateCp << "\n"
+                                << "donorNewCpFwd = " << donorNewCpFwd.cp
+                                << (donorNewCpFwd.propagate ? " true " : " false ")
+                                << donorNewCpFwd.propagateCp << endl);
+
+        recipientp->setCritPathCost(GraphWay::FORWARD, recipientNewCpFwd.cp);
+        if (recipientNewCpFwd.propagate) {
+            m_forwardPropagator.cpHasIncreased(recipientp, recipientNewCpFwd.propagateCp);
+        }
+        recipientp->setCritPathCost(GraphWay::REVERSE, recipientNewCpRev.cp);
+        if (recipientNewCpRev.propagate) {
+            m_reversePropagator.cpHasIncreased(recipientp, recipientNewCpRev.propagateCp);
+        }
+        if (donorNewCpFwd.propagate) {
+            m_forwardPropagator.cpHasIncreased(donorp, donorNewCpFwd.propagateCp);
+        }
+        if (donorNewCpRev.propagate) {
+            m_reversePropagator.cpHasIncreased(donorp, donorNewCpRev.propagateCp);
+        }
+        m_forwardPropagator.go();
+        m_reversePropagator.go();
+
+        // Remove all other SiblingMCs that include recipientp or donorp. We remove all siblingMCs
+        // of recipientp so we do not get huge numbers of SiblingMCs. We'll recreate them below, up
+        // to a bounded number.
+        removeSiblingMCs(recipientp, donorp);
+
+        // Redirect all edges, delete donorp
+        partRedirectEdgesFrom(m_mtasksp, recipientp, donorp, &m_sb);
+
+        ++m_mergesSinceRescore;
+
+        // Do an expensive check, confirm we haven't botched the CP
+        // updates.
+        if (m_slowAsserts) partCheckCriticalPaths(m_mtasksp);
+
+        // Finally, make new sibling pairs as needed:
+        //  - prereqs and postreqs of recipientp
+        //  - prereqs of recipientp's postreqs
+        //  - postreqs of recipientp's prereqs
+        // Note that this depends on the updated critical paths (above).
+        siblingPairFromRelatives<GraphWay::FORWARD, true>(recipientp);
+        siblingPairFromRelatives<GraphWay::REVERSE, true>(recipientp);
+        unsigned edges = 0;
+        for (V3GraphEdge* edgep = recipientp->outBeginp(); edgep; edgep = edgep->outNextp()) {
+            LogicMTask* const postreqp = static_cast<LogicMTask*>(edgep->top());
+            siblingPairFromRelatives<GraphWay::REVERSE, false>(postreqp);
+            ++edges;
+            if (edges >= PART_SIBLING_EDGE_LIMIT) break;
+        }
+        edges = 0;
+        for (V3GraphEdge* edgep = recipientp->inBeginp(); edgep; edgep = edgep->inNextp()) {
+            LogicMTask* const prereqp = static_cast<LogicMTask*>(edgep->fromp());
+            siblingPairFromRelatives<GraphWay::FORWARD, false>(prereqp);
+            ++edges;
+            if (edges >= PART_SIBLING_EDGE_LIMIT) break;
+        }
+    }
+
+    void doRescore() {
+        // During rescore, we know that graph isn't changing, so allow
+        // the critPathCost*Without() routines to cache some data in
+        // each LogicMTask. This is just an optimization, things should
+        // behave identically without the caching (just slower)
+
+        m_sb.rescore();
+        UINFO(6, "Did rescore. Merges since previous = " << m_mergesSinceRescore << endl);
+
+        m_mergesSinceRescore = 0;
+        m_scoreLimitBeforeRescore = 0xffffffff;
+    }
+
+    void makeSiblingMC(LogicMTask* ap, LogicMTask* bp) {
+        if (ap->id() < bp->id()) std::swap(ap, bp);
+        // The higher id vertex owns the association set
+        const auto first = ap->siblings().insert(bp).second;
+        if (first) {
+            m_sb.add(new SiblingMC{ap, bp});
+        } else if (VL_UNLIKELY(m_slowAsserts)) {
+            // It's fine if we already have this SiblingMC, we may have
+            // created it earlier. Just confirm that we have associated data.
+            bool found = false;
+            for (const SiblingMC* smcp = ap->aSiblingMCs().begin();  // lintok-begin-on-ref
+                 smcp; smcp = smcp->aNextp()) {
+                UASSERT_OBJ(smcp->ap() == ap, ap, "Inconsistent SiblingMC");
+                UASSERT_OBJ(m_sb.contains(smcp), ap, "Must be on the scoreboard");
+                if (smcp->bp() == bp) found = true;
+            }
+            UASSERT_OBJ(found, ap, "Sibling not found");
+        }
+    }
+    template <GraphWay::en T_Way, bool Exhaustive>
+    void siblingPairFromRelatives(V3GraphVertex* mtaskp) {
+        constexpr GraphWay way{T_Way};
+        // Need at least 2 edges
+        if (!mtaskp->beginp(way) || !mtaskp->beginp(way)->nextp(way)) return;
+
+        std::array<LogicMTask*, PART_SIBLING_EDGE_LIMIT> neighbors;
+
+        // This is a hot method, so we want to sort as efficiently as possible. We pre-load
+        // all data (critical path cost and id) required for determining ordering into an aligned
+        // structure. There is not enough space next to these to keep a whole pointer within 16
+        // bytes, so we store an index into the 'neighbors' buffer instead. We can then compare
+        // and swap these sorting records very efficiently. With this the standard library sorting
+        // functions are efficient enough and using more optimized methods (e.g.: sorting networks)
+        // has no measurable benefit.
+        struct alignas(16) SortingRecord final {
+            uint64_t m_id;
+            uint32_t m_cp;
+            uint8_t m_idx;
+            static_assert(PART_SIBLING_EDGE_LIMIT <= std::numeric_limits<uint8_t>::max(),
+                          "m_idx must fit all indices into 'neighbors'");
+            bool operator<(const SortingRecord& that) const {
+                return m_cp < that.m_cp || (m_cp == that.m_cp && m_id < that.m_id);
+            }
+        };
+        static_assert(sizeof(SortingRecord) <= 16, "How could this be padded to more than 16?");
+
+        std::array<SortingRecord, PART_SIBLING_EDGE_LIMIT> sortRecs;
+        size_t n = 0;
+
+        // Populate the buffers
+        for (V3GraphEdge *edgep = mtaskp->beginp(way), *nextp; edgep; edgep = nextp) {
+            nextp = edgep->nextp(way);  // Fetch next first as likely cache miss
+            LogicMTask* const otherp = static_cast<LogicMTask*>(edgep->furtherp(way));
+            neighbors[n] = otherp;
+            sortRecs[n].m_id = otherp->id();
+            sortRecs[n].m_cp = otherp->critPathCost(way) + otherp->cost();
+            sortRecs[n].m_idx = n;
+            ++n;
+            // Prevent nodes with huge numbers of edges from massively slowing us down
+            if (n >= PART_SIBLING_EDGE_LIMIT) break;
+        }
+
+        // Don't make all possible pairs of siblings when not requested (non-exhaustive).
+        // Just make a few pairs.
+        constexpr size_t MAX_NONEXHAUSTIVE_PAIRS = 3;
+
+        if (Exhaustive || n <= 2 * MAX_NONEXHAUSTIVE_PAIRS) {
+            const size_t end = n & ~static_cast<size_t>(1);  // Round down to even, (we want pairs)
+            std::sort(sortRecs.begin(), sortRecs.begin() + n);
+            for (size_t i = 0; i < end; i += 2) {
+                makeSiblingMC(neighbors[sortRecs[i].m_idx], neighbors[sortRecs[i + 1].m_idx]);
+            }
+        } else {
+            constexpr size_t end = 2 * MAX_NONEXHAUSTIVE_PAIRS;
+            std::partial_sort(sortRecs.begin(), sortRecs.begin() + end, sortRecs.begin() + n);
+            for (size_t i = 0; i < end; i += 2) {
+                makeSiblingMC(neighbors[sortRecs[i].m_idx], neighbors[sortRecs[i + 1].m_idx]);
+            }
+        }
+    }
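The pairing step above, in isolation: order neighbor records by (critical path, id), then pair them off two at a time, using std::partial_sort when only the first few pairs are wanted. Record, pairClosest, and makePair are illustrative stand-ins for the real types:

    #include <algorithm>
    #include <array>
    #include <cstddef>
    #include <cstdint>

    struct Record {
        uint64_t id;
        uint32_t cp;
        bool operator<(const Record& that) const {
            return cp < that.cp || (cp == that.cp && id < that.id);
        }
    };

    template <typename Callback>
    void pairClosest(std::array<Record, 32>& recs, std::size_t n, bool exhaustive,
                     Callback makePair) {
        constexpr std::size_t MAX_PAIRS = 3;
        if (exhaustive || n <= 2 * MAX_PAIRS) {
            std::sort(recs.begin(), recs.begin() + n);
            for (std::size_t i = 0; i + 1 < n; i += 2) makePair(recs[i], recs[i + 1]);
        } else {
            // Only the first 2*MAX_PAIRS entries need to be ordered; the rest stay unsorted
            std::partial_sort(recs.begin(), recs.begin() + 2 * MAX_PAIRS, recs.begin() + n);
            for (std::size_t i = 0; i < 2 * MAX_PAIRS; i += 2) makePair(recs[i], recs[i + 1]);
        }
    }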
+    // SELF TESTS
+
+    // This is a performance test, its intent is to demonstrate that the
+    // partitioner doesn't run on this chain in N^2 time or worse. Overall
+    // runtime should be N*log(N) for a chain-shaped graph.
+    //
+    static void selfTestChain() {
+        const uint64_t usecsSmall = partitionChainUsecs(5);
+        const uint64_t usecsLarge = partitionChainUsecs(500);
+        // Large input is 100x bigger than small input.
+        // Its runtime should be far less than 10000x longer, which would
+        // suggest N^2 scaling or worse.
+        UASSERT(usecsLarge < (usecsSmall * 1500),
+                "selfTestChain() took longer than expected. Small input runtime = "
+                    << usecsSmall << ", large input runtime = " << usecsLarge);
+    }
+
+    static uint64_t partitionChainUsecs(unsigned chain_len) {
+        // NOTE: To get a dot file run with --debugi-V3Partition 4 or more.
+        const uint64_t startUsecs = V3Os::timeUsecs();
+        V3Graph mtasks;
+        LogicMTask* lastp = nullptr;
+        for (unsigned i = 0; i < chain_len; ++i) {
+            LogicMTask* const mtp = new LogicMTask{&mtasks, nullptr};
+            mtp->setCost(1);
+            if (lastp) new MTaskEdge{&mtasks, lastp, mtp, 1};
+            lastp = mtp;
+        }
+        partInitCriticalPaths(&mtasks);
+
+        // Since slowAsserts mode is *expected* to cause N^2 runtime, and the
+        // intent of this test is to demonstrate better-than-N^2 runtime, disable
+        // slowAsserts.
+        PartContraction ec{&mtasks,
+                           // Any CP limit >chain_len should work:
+                           chain_len * 2, nullptr, nullptr, false /* slowAsserts */};
+        ec.go();
+
+        // All vertices should merge into one
+        UASSERT_SELFTEST(
+            bool, mtasks.verticesBeginp() && !mtasks.verticesBeginp()->verticesNextp(), true);
+
+        const uint64_t endUsecs = V3Os::timeUsecs();
+        const uint64_t elapsedUsecs = endUsecs - startUsecs;
+
+        return elapsedUsecs;
+    }
+
+    // This test defends against a particular failure mode that the
+    // partitioner exhibited during development:
+    //
+    // At one time, the partitioner consistently favored edge-merges over
+    // equal-scoring sibling merges. Every edge and sibling merge in this
+    // test starts out with an equal score. If you only do edge-merges, all
+    // possible merges will continue to have equal score as the center node
+    // grows and grows. Soon the critical path budget is exhausted by a
+    // large center node, and we still have many small leaf nodes -- it's
+    // literally the worst partition possible.
+    //
+    // Now, instead, the partitioner gives slight favoritism to sibling
+    // merges in the event that scores are tied. This is better for the
+    // test and also real designs.
+    static void selfTestX() {
+        // NOTE: To get a dot file run with --debugi-V3Partition 4 or more.
+        V3Graph mtasks;
+        LogicMTask* const centerp = new LogicMTask{&mtasks, nullptr};
+        centerp->setCost(1);
+        unsigned i;
+        for (i = 0; i < 50; ++i) {
+            LogicMTask* const mtp = new LogicMTask{&mtasks, nullptr};
+            mtp->setCost(1);
+            // Edge from every input -> centerp
+            new MTaskEdge{&mtasks, mtp, centerp, 1};
+        }
+        for (i = 0; i < 50; ++i) {
+            LogicMTask* const mtp = new LogicMTask{&mtasks, nullptr};
+            mtp->setCost(1);
+            // Edge from centerp -> every output
+            new MTaskEdge{&mtasks, centerp, mtp, 1};
+        }
+
+        partInitCriticalPaths(&mtasks);
+        PartContraction{&mtasks, 20, nullptr, nullptr, true}.go();
+
+        const auto report = mtasks.parallelismReport(
+            [](const V3GraphVertex* vtxp) { return vtxp->as<LogicMTask>()->cost(); });
+
+        // Checking exact values here is maybe overly precise. What we're
+        // mostly looking for is a healthy reduction in the number of mtasks.
+        UASSERT_SELFTEST(uint32_t, report.criticalPathCost(), 19);
+        UASSERT_SELFTEST(uint32_t, report.totalGraphCost(), 101);
+        UASSERT_SELFTEST(uint32_t, report.vertexCount(), 14);
+        UASSERT_SELFTEST(uint32_t, report.edgeCount(), 13);
+    }
+
+public:
+    static void selfTest() {
+        selfTestX();
+        selfTestChain();
+    }
+
+private:
+    VL_UNCOPYABLE(PartContraction);
+};
+
+//######################################################################
+// DpiImportCallVisitor
+
+// Scan node, indicate whether it contains a call to a DPI imported
+// routine.
+class DpiImportCallVisitor final : public VNVisitor {
+    bool m_hasDpiHazard = false;  // Found a DPI import call.
+    bool m_tracingCall = false;  // Iterating into a CCall to a CFunc
+    // METHODS
+    void visit(AstCFunc* nodep) override {
+        if (!m_tracingCall) return;
+        m_tracingCall = false;
+        if (nodep->dpiImportWrapper()) {
+            if (nodep->dpiPure() ? !v3Global.opt.threadsDpiPure()
+                                 : !v3Global.opt.threadsDpiUnpure()) {
+                m_hasDpiHazard = true;
+            }
+        }
+        iterateChildren(nodep);
+    }
+    void visit(AstNodeCCall* nodep) override {
+        iterateChildren(nodep);
+        // Enter the function and trace it
+        m_tracingCall = true;
+        iterate(nodep->funcp());
+    }
+    void visit(AstNode* nodep) override { iterateChildren(nodep); }
+
+public:
+    // CONSTRUCTORS
+    explicit DpiImportCallVisitor(AstNode* nodep) { iterate(nodep); }
+    bool hasDpiHazard() const { return m_hasDpiHazard; }
+    ~DpiImportCallVisitor() override = default;
+
+private:
+    VL_UNCOPYABLE(DpiImportCallVisitor);
+};
+
+//######################################################################
+// PartFixDataHazards
+
+// Fix data hazards in the partition graph.
+//
+// The fine-grained graph from V3Order may contain data hazards which are
+// not a problem for serial mode, but which would be a problem in parallel
+// mode.
+//
+// There are basically two classes: unordered pairs of writes, and
+// unordered write-read pairs. We fix both here, with a combination of
+// MTask-merges and new edges to ensure no such unordered pairs remain.
+//
+// ABOUT UNORDERED WRITE-WRITE PAIRS
+//
+//   The V3Order dependency graph treats these as unordered events:
+//
+//     a) sig[15:8] = stuff;
+//          ...
+//     b) sig[7:0] = other_stuff;
+//
+//   Seems OK right? They are writes to disjoint bits of the same
+//   signal. They can run in either order, in serial mode, and the result
+//   will be the same.
+//
+//   The resulting C code for each of these isn't a pure write, it's
+//   actually an R-M-W sequence:
+//
+//     a) sig = (sig & 0xff) | (0xff00 & (stuff << 8));
+//          ...
+//     b) sig = (sig & 0xff00) | (0xff & other_stuff);
+//
+//   In serial mode, order doesn't matter so long as these run serially.
+//   In parallel mode, we must serialize these RMW's to avoid a race.
+//
+//   We don't actually check here if each write would involve an R-M-W, we
+//   just assume that it would. If this routine ever causes a drastic
+//   increase in critical path, it could be optimized to make a better
+//   prediction (with all the risk that word implies!) about whether a
+//   given write is likely to turn into an R-M-W.
+//
+// ABOUT UNORDERED WRITE-READ PAIRS
+//
+//   If we don't put unordered write-read pairs into some order at Verilation
+//   time, we risk a runtime race.
+//
+//   How do such unordered writer/reader pairs happen? Here's a partial list
+//   of scenarios:
+//
+//   Case 1: Circular logic
+//
+//     If the design has circular logic, V3Order has by now generated some
+//     dependency cycles, and also cut some of the edges to make it
+//     acyclic.
+//
+//     For serial mode, that was fine. We can break logic circles at an
+//     arbitrary point. At runtime, we'll repeat the _eval() until no
+//     changes are detected, which papers over the discarded dependency.
+//
+//     For parallel mode, this situation can lead to unordered reads and
+//     writes of the same variable, causing a data race. For example if the
+//     original code is this:
+//
+//       assign b = b | a << 2;
+//       assign out = b;
+//
+//     ... there's originally a dependency edge which records that 'b'
+//     depends on the first assign. V3Order may cut this edge, making the
+//     statements unordered. In serial mode that's fine, they can run in
+//     either order. In parallel mode it's a reader/writer race.
+//
+//   Case 2: Race Condition in Verilog Sources
+//
+//     If the input has races, eg. blocking assignments in always blocks
+//     that share variables, the graph at this point will contain unordered
+//     writes and reads (or unordered write-write pairs) reflecting that.
+//
+//   Case 3: Interesting V3Order Behavior
+//
+//     There's code in V3Order that explicitly avoids making a dependency
+//     edge from a clock-gater signal to the logic node that produces the
+//     clock signal. This leads to unordered reader/writer pairs in
+//     parallel mode.
+//
+class PartFixDataHazards final {
+    // TYPES
+    using TasksByRank = std::map<uint32_t /*rank*/, std::set<LogicMTask*, MTaskIdLessThan>>;
+
+    // MEMBERS
+    const OrderGraph* const m_orderGraphp;  // The OrderGraph
+    V3Graph* const m_mtasksp;  // Mtask graph
+public:
+    // CONSTRUCTORs
+    explicit PartFixDataHazards(const OrderGraph* orderGraphp, V3Graph* mtasksp)
+        : m_orderGraphp{orderGraphp}
+        , m_mtasksp{mtasksp} {}
+    // METHODS
+private:
+    void findAdjacentTasks(const OrderVarStdVertex* varVtxp, TasksByRank& tasksByRank) {
+        // Find all writer tasks for this variable, group by rank.
+        for (V3GraphEdge* edgep = varVtxp->inBeginp(); edgep; edgep = edgep->inNextp()) {
+            if (const auto* const logicVtxp = edgep->fromp()->cast<OrderLogicVertex>()) {
+                LogicMTask* const writerMtaskp = static_cast<LogicMTask*>(logicVtxp->userp());
+                tasksByRank[writerMtaskp->rank()].insert(writerMtaskp);
+            }
+        }
+        // Not: Find all reader tasks for this variable, group by rank.
+        // There was "broken" code here to find readers, but fixing it to
+        // work properly harmed performance on some tests, see issue #3360.
+    }
+    void mergeSameRankTasks(const TasksByRank& tasksByRank) {
+        LogicMTask* lastRecipientp = nullptr;
+        for (const auto& pair : tasksByRank) {
+            // Find the largest node at this rank, merge into it. (If we
+            // happen to find a huge node, this saves time in
+            // partRedirectEdgesFrom() versus merging into an arbitrary node.)
+            LogicMTask* recipientp = nullptr;
+            for (LogicMTask* const mtaskp : pair.second) {
+                if (!recipientp || (recipientp->cost() < mtaskp->cost())) recipientp = mtaskp;
+            }
+            UASSERT_OBJ(!lastRecipientp || (lastRecipientp->rank() < recipientp->rank()),
+                        recipientp, "Merging must be on lower rank");
+
+            for (LogicMTask* const donorp : pair.second) {
+                // Merge donor into recipient.
+                if (donorp == recipientp) continue;
+                // Fix up the map, so donor's OLVs map to recipientp
+                for (const MTaskMoveVertex* const tmvp : *(donorp->vertexListp())) {
+                    tmvp->logicp()->userp(recipientp);
+                }
+                // Move all vertices from donorp to recipientp
+                recipientp->moveAllVerticesFrom(donorp);
+                // Redirect edges from donorp to recipientp, delete donorp
+                partRedirectEdgesFrom(m_mtasksp, recipientp, donorp, nullptr);
+            }
+
+            if (lastRecipientp && !lastRecipientp->hasRelativeMTask(recipientp)) {
+                new MTaskEdge{m_mtasksp, lastRecipientp, recipientp, 1};
+            }
+            lastRecipientp = recipientp;
+        }
+    }
+    bool hasDpiHazard(LogicMTask* mtaskp) {
+        for (const MTaskMoveVertex* const moveVtxp : *(mtaskp->vertexListp())) {
+            if (OrderLogicVertex* const lvtxp = moveVtxp->logicp()) {
+                // NOTE: We don't handle DPI exports. If testbench code calls a
+                // DPI-exported function at any time during eval() we may have
+                // a data hazard. (Likewise in non-threaded mode if an export
+                // messes with an ordered variable we're broken.)
+
+                // Find all calls to DPI-imported functions, we can put those
+                // into a serial order at least. That should solve the most
+                // likely DPI-related data hazards.
+                if (DpiImportCallVisitor{lvtxp->nodep()}.hasDpiHazard()) return true;
+            }
+        }
+        return false;
+    }
+
+public:
+    void go() {
+        // Rank the graph. DGS is faster than V3GraphAlg's recursive rank, and also allows us to
+        // set up the OrderLogicVertex -> LogicMTask map at the same time.
+        {
+            GraphStreamUnordered serialize{m_mtasksp};
+            while (LogicMTask* const mtaskp = const_cast<LogicMTask*>(
+                       static_cast<const LogicMTask*>(serialize.nextp()))) {
+                // Compute and assign rank
+                uint32_t rank = 0;
+                for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep; edgep = edgep->inNextp()) {
+                    rank = std::max(edgep->fromp()->rank() + 1, rank);
+                }
+                mtaskp->rank(rank);
+
+                // Set up the OrderLogicVertex -> LogicMTask map
+                // Entry and exit MTasks have no MTaskMoveVertices under them, so move on
+                if (mtaskp->vertexListp()->empty()) continue;
+                // Otherwise there should be only one MTaskMoveVertex in each MTask at this stage
+                UASSERT_OBJ(mtaskp->vertexListp()->size() == 1, mtaskp,
+                            "Multiple MTaskMoveVertex");
+                const MTaskMoveVertex* const moveVtxp = mtaskp->vertexListp()->front();
+                // Set up mapping back to the MTask from the OrderLogicVertex
+                if (OrderLogicVertex* const lvtxp = moveVtxp->logicp()) lvtxp->userp(mtaskp);
+            }
+        }
+
+        // Gather all variables. SystemC vars will be handled slightly specially, so keep separate.
+        std::vector<const OrderVarStdVertex*> regularVars;
+        std::vector<const OrderVarStdVertex*> systemCVars;
+        for (V3GraphVertex *vtxp = m_orderGraphp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
+            nextp = vtxp->verticesNextp();
+            // Only consider OrderVarStdVertex which reflects
+            // an actual lvalue assignment; the others do not.
+            if (const OrderVarStdVertex* const vvtxp = vtxp->cast<OrderVarStdVertex>()) {
+                if (vvtxp->vscp()->varp()->isSc()) {
+                    systemCVars.push_back(vvtxp);
+                } else {
+                    regularVars.push_back(vvtxp);
+                }
+            }
+        }
+
+        // For each OrderVarVertex, look at its writer and reader mtasks.
+        //
+        // If there's a set of writers and readers at the same rank, we
+        // know these are unordered with respect to one another, so merge
+        // those mtasks all together.
+        //
+        // At this point, we have at most one merged mtask per rank (for a
+        // given OVV.) Create edges across these remaining mtasks to ensure
+        // they run in serial order (going along with the existing ranks.)
+        //
+        // NOTE: we don't update the CP's stored in the LogicMTasks to
+        // reflect the changes we make to the graph. That's OK, as we
+        // haven't yet initialized CPs when we call this routine.
+        for (const OrderVarStdVertex* const varVtxp : regularVars) {
+            // Build a set of mtasks, per rank, which access this var.
+            // Within a rank, sort by MTaskID to avoid nondeterminism.
+            TasksByRank tasksByRank;
+
+            // Find all reader and writer tasks for this variable, add to
+            // tasksByRank.
+            findAdjacentTasks(varVtxp, tasksByRank);
+
+            // Merge all writer and reader tasks from same rank together.
+            //
+            // NOTE: Strictly speaking, we don't need to merge all the
+            // readers together. That may lead to extra serialization. The
+            // least amount of ordering we could impose here would be to
+            // merge all writers at a given rank together; then make edges
+            // from the merged writer node to each reader node at the same
+            // rank; and then from each reader node to the merged writer at
+            // the next rank.
+            //
+            // Whereas, merging all readers and writers at the same rank
+            // together is "the simplest thing that could possibly work"
+            // and it seems to. It also creates fairly few edges. We don't
+            // want to create tons of edges here, doing so is not nice to
+            // the main edge contraction pass.
+            mergeSameRankTasks(tasksByRank);
+        }
+        // Handle SystemC vars just a little differently. Instead of
+        // treating each var as an independent entity, and serializing
+        // writes to that one var, we treat ALL systemC vars as a single
+        // entity and serialize writes (and, conservatively, reads) across
+        // all of them.
+        //
+        // Reasoning: writing a systemC var actually turns into a call to a
+        // var.write() method, which under the hood is accessing some data
+        // structure that's shared by many SC vars. It's not thread safe.
+        //
+        // Hopefully we only have a few SC vars -- top level ports, probably.
+        {
+            TasksByRank tasksByRank;
+            for (const OrderVarStdVertex* const varVtxp : systemCVars) {
+                findAdjacentTasks(varVtxp, tasksByRank);
+            }
+            mergeSameRankTasks(tasksByRank);
+        }
+
+        // Handle nodes containing DPI calls, we want to serialize those
+        // by default unless user gave --threads-dpi-concurrent.
+        // Same basic strategy as above to serialize access to SC vars.
+        if (!v3Global.opt.threadsDpiPure() || !v3Global.opt.threadsDpiUnpure()) {
+            TasksByRank tasksByRank;
+            for (V3GraphVertex *vtxp = m_mtasksp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
+                nextp = vtxp->verticesNextp();
+                LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp);
+                if (hasDpiHazard(mtaskp)) tasksByRank[mtaskp->rank()].insert(mtaskp);
+            }
+            mergeSameRankTasks(tasksByRank);
+        }
+    }
+
+private:
+    VL_UNCOPYABLE(PartFixDataHazards);
+};
+
+//######################################################################
+// V3Partition implementation
+
+void V3Partition::debugMTaskGraphStats(const V3Graph* graphp, const string& stage) {
+    if (!debug() && !dumpLevel() && !dumpGraphLevel()) return;
+
+    UINFO(4, "\n");
+    UINFO(4, " Stats for " << stage << endl);
+    uint32_t mtaskCount = 0;
+    uint32_t totalCost = 0;
+    std::array<uint32_t, 32> mtaskCostHist;
+    mtaskCostHist.fill(0);
+
+    for (const V3GraphVertex* mtaskp = graphp->verticesBeginp(); mtaskp;
+         mtaskp = mtaskp->verticesNextp()) {
+        ++mtaskCount;
+        uint32_t mtaskCost = mtaskp->as<AbstractMTask>()->cost();
+        totalCost += mtaskCost;
+
+        unsigned log2Cost = 0;
+        while (mtaskCost >>= 1) ++log2Cost;
+        UASSERT(log2Cost < 32, "log2Cost overflow in debugMTaskGraphStats");
+        ++mtaskCostHist[log2Cost];
+    }
+    UINFO(4, "  Total mtask cost = " << totalCost << "\n");
+    UINFO(4, "  Mtask count = " << mtaskCount << "\n");
+    UINFO(4, "  Avg cost / mtask = "
+                 << ((mtaskCount > 0) ? cvtToStr(totalCost / mtaskCount) : "INF!") << "\n");
+    UINFO(4, "  Histogram of mtask costs:\n");
+    for (unsigned i = 0; i < 32; ++i) {
+        if (mtaskCostHist[i]) {
+            UINFO(4, "    2^" << i << ": " << mtaskCostHist[i] << endl);
+            V3Stats::addStat("MTask graph, " + stage + ", mtask cost 2^" + (i < 10 ? " " : "")
+                                 + cvtToStr(i),
+                             mtaskCostHist[i]);
+        }
+    }
+
+    if (mtaskCount < 1000) {
+        string filePrefix("ordermv_");
+        filePrefix += stage;
+        if (dumpGraphLevel() >= 4) graphp->dumpDotFilePrefixedAlways(filePrefix);
+    }
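The shift loop above computes floor(log2(cost)) to pick a histogram bucket; for example, cost 12 (0b1100) shifts right three times before reaching zero, landing in bucket 3 (costs 8..15). The same computation in isolation:

    #include <cstdint>

    // Returns floor(log2(cost)) for cost >= 1; cost 0 also lands in bucket 0,
    // matching the loop in debugMTaskGraphStats() above.
    inline unsigned log2Bucket(uint32_t cost) {
        unsigned log2Cost = 0;
        while (cost >>= 1) ++log2Cost;
        return log2Cost;  // log2Bucket(1) == 0, log2Bucket(12) == 3, log2Bucket(1024) == 10
    }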
+    const auto report = graphp->parallelismReport(
+        [](const V3GraphVertex* vtxp) { return vtxp->as()->cost(); });
+    V3Stats::addStat("MTask graph, " + stage + ", critical path cost", report.criticalPathCost());
+    V3Stats::addStat("MTask graph, " + stage + ", total graph cost", report.totalGraphCost());
+    V3Stats::addStat("MTask graph, " + stage + ", mtask count", report.vertexCount());
+    V3Stats::addStat("MTask graph, " + stage + ", edge count", report.edgeCount());
+    V3Stats::addStat("MTask graph, " + stage + ", parallelism factor", report.parallelismFactor());
+    if (debug() >= 4) {
+        UINFO(0, "\n");
+        UINFO(0, "    MTask parallelism estimate based on costs at stage " << stage << ":\n");
+        UINFO(0, "    Critical path cost = " << report.criticalPathCost() << "\n");
+        UINFO(0, "    Total graph cost = " << report.totalGraphCost() << "\n");
+        UINFO(0, "    MTask vertex count = " << report.vertexCount() << "\n");
+        UINFO(0, "    Edge count = " << report.edgeCount() << "\n");
+        UINFO(0, "    Parallelism factor = " << report.parallelismFactor() << "\n");
+    }
+}
+
+// Print a hash of the shape of graphp. If you are battling
+// nondeterminism, this can help to pinpoint where in the pipeline it's
+// creeping in.
+void V3Partition::hashGraphDebug(const V3Graph* graphp, const char* debugName) {
+    // Disabled when there are no nondeterminism issues in flight.
+    if (!v3Global.opt.debugNondeterminism()) return;
+
+    std::unordered_map<const V3GraphVertex*, unsigned> vx2Id;
+    unsigned id = 0;
+    for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
+        vx2Id[vxp] = id++;
+    }
+    unsigned hash = 0;
+    for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
+        for (const V3GraphEdge* edgep = vxp->outBeginp(); edgep; edgep = edgep->outNextp()) {
+            const V3GraphVertex* const top = edgep->top();
+            hash = vx2Id[top] + 31U * hash;  // The K&R hash function
+        }
+    }
+    UINFO(0, "Hash of shape (not contents) of " << debugName << " = " << cvtToStr(hash) << endl);
+}
+
+// Predicate function to determine which MTaskMoveVertex instances to bypass when constructing
+// the MTask graph. The fine-grained dependency graph of MTaskMoveVertex vertices is a bipartite
+// graph of:
+// - 1. MTaskMoveVertex instances containing logic via OrderLogicVertex
+//      (MTaskMoveVertex::logicp() != nullptr)
+// - 2. MTaskMoveVertex instances containing an (OrderVarVertex, domain) pair
+// Our goal is to order the logic vertices. The second type of variable/domain vertices only
+// carry dependencies and are eventually discarded. In order to reduce the working set size of
+// PartContraction, we 'bypass' a variable vertex, creating no LogicMTask for it and adding its
+// transitive dependency edges directly instead, but only if doing so does not require more
+// dependency edges than keeping the intermediate vertex. That is, we bypass a variable vertex
+// if fanIn * fanOut <= fanIn + fanOut. This can only be true if fanIn or fanOut is 1, or if
+// they are both 2. This can significantly reduce the working set size.
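To see why the bypass test can only pass when one fan count is at most 1, or both are exactly 2, here is a minimal standalone sketch of the criterion (hypothetical helper name, plain C++, not part of the patch):

// Standalone illustration (not Verilator code) of the bypass criterion:
// replacing a variable vertex by transitive edges needs fanIn * fanOut
// edges, keeping it needs fanIn + fanOut, so bypass only when the
// product does not exceed the sum.
#include <cassert>

static bool bypassWorthIt(unsigned fanIn, unsigned fanOut) {
    return fanIn * fanOut <= fanIn + fanOut;
}

int main() {
    assert(bypassWorthIt(1, 7));   // 7 <= 8: one input fanning out
    assert(bypassWorthIt(7, 1));   // 7 <= 8: many inputs, one output
    assert(bypassWorthIt(2, 2));   // 4 <= 4: the only case with both sides > 1
    assert(!bypassWorthIt(2, 3));  // 6 > 5: keeping the vertex is cheaper
    assert(!bypassWorthIt(3, 3));  // 9 > 6
    return 0;
}

The real predicate above stops counting each fan at 3, since any count of 3 or more can only pass when the other side is 1, which has already been checked.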
+static bool bypassOk(MTaskMoveVertex* mvtxp) {
+    // Need to keep all logic vertices
+    if (mvtxp->logicp()) return false;
+    // Count fan-in, up to 3
+    unsigned fanIn = 0;
+    for (V3GraphEdge* edgep = mvtxp->inBeginp(); edgep; edgep = edgep->inNextp()) {
+        if (++fanIn == 3) break;
+    }
+    UDEBUGONLY(UASSERT_OBJ(fanIn <= 3, mvtxp, "Should have stopped counting fanIn"););
+    // If fan-in is no more than one, bypass
+    if (fanIn <= 1) return true;
+    // Count fan-out, up to 3
+    unsigned fanOut = 0;
+    for (V3GraphEdge* edgep = mvtxp->outBeginp(); edgep; edgep = edgep->outNextp()) {
+        if (++fanOut == 3) break;
+    }
+    UDEBUGONLY(UASSERT_OBJ(fanOut <= 3, mvtxp, "Should have stopped counting fanOut"););
+    // If fan-out is no more than one, bypass
+    if (fanOut <= 1) return true;
+    // They can only be (2, 2), (2, 3), (3, 2), (3, 3) at this point, bypass if (2, 2)
+    return fanIn + fanOut == 4;
+}
+
+uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) {
+    uint32_t totalGraphCost = 0;
+
+    // Artificial single entry point vertex in the MTask graph to allow sibling merges.
+    // This is required as otherwise disjoint sub-graphs could not be merged, as the
+    // coarsening algorithm assumes that the graph is connected.
+    m_entryMTaskp = new LogicMTask{mtasksp, nullptr};
+
+    // The V3InstrCount within LogicMTask will set user1 on each AST
+    // node, to assert that we never count any node twice.
+    const VNUser1InUse user1inUse;
+
+    // Create the LogicMTasks for each MTaskMoveVertex
+    for (V3GraphVertex *vtxp = m_fineDepsGraphp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
+        nextp = vtxp->verticesNextp();
+        MTaskMoveVertex* const mVtxp = static_cast<MTaskMoveVertex*>(vtxp);
+        if (bypassOk(mVtxp)) {
+            mVtxp->userp(nullptr);  // Set to nullptr to mark as bypassed
+        } else {
+            LogicMTask* const mtaskp = new LogicMTask{mtasksp, mVtxp};
+            mVtxp->userp(mtaskp);
+            totalGraphCost += mtaskp->cost();
+        }
+    }
+
+    // Artificial single exit point vertex in the MTask graph to allow sibling merges.
+    // This enables merging MTasks with no downstream dependents if that is the ideal merge.
+    m_exitMTaskp = new LogicMTask{mtasksp, nullptr};
+
+    // Create the mtask->mtask dependency edges based on the dependencies between
+    // MTaskMoveVertex vertices.
+    for (V3GraphVertex *vtxp = mtasksp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
+        nextp = vtxp->verticesNextp();
+        LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp);
+
+        // Entry and exit vertices handled separately
+        if (VL_UNLIKELY((mtaskp == m_entryMTaskp) || (mtaskp == m_exitMTaskp))) continue;
+
+        // At this point, there should only be one MTaskMoveVertex per LogicMTask
+        UASSERT_OBJ(mtaskp->vertexListp()->size() == 1, mtaskp, "Multiple MTaskMoveVertex");
+        MTaskMoveVertex* const mvtxp = mtaskp->vertexListp()->front();
+        UASSERT_OBJ(mvtxp->userp(), mtaskp, "Bypassed MTaskMoveVertex should not have MTask");
+
+        // Function to add an edge to a dependent from 'mtaskp'
+        const auto addEdge = [mtasksp, mtaskp](LogicMTask* otherp) {
+            UASSERT_OBJ(otherp != mtaskp, mtaskp, "Would create a cycle edge");
+            if (mtaskp->hasRelativeMTask(otherp)) return;  // Don't create redundant edges.
+            new MTaskEdge{mtasksp, mtaskp, otherp, 1};
+        };
+
+        // Iterate downstream direct dependents
+        for (V3GraphEdge *dEdgep = mvtxp->outBeginp(), *dNextp; dEdgep; dEdgep = dNextp) {
+            dNextp = dEdgep->outNextp();
+            V3GraphVertex* const top = dEdgep->top();
+            if (LogicMTask* const otherp = static_cast<LogicMTask*>(top->userp())) {
+                // The opposite end of the edge is not a bypassed vertex, add as direct dependent
+                addEdge(otherp);
+            } else {
+                // The opposite end of the edge is a bypassed vertex, add transitive dependents
+                for (V3GraphEdge *tEdgep = top->outBeginp(), *tNextp; tEdgep; tEdgep = tNextp) {
+                    tNextp = tEdgep->outNextp();
+                    LogicMTask* const transp = static_cast<LogicMTask*>(tEdgep->top()->userp());
+                    // The Move graph is bipartite (logic <-> var), and logic is never bypassed,
+                    // hence 'transp' must be non-null.
+                    UASSERT_OBJ(transp, mvtxp, "This cannot be a bypassed vertex");
+                    addEdge(transp);
+                }
+            }
+        }
+    }
+
+    // Create dependencies to/from the entry/exit vertices.
+    for (V3GraphVertex *vtxp = mtasksp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
+        nextp = vtxp->verticesNextp();
+        LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp);
+
+        if (VL_UNLIKELY((mtaskp == m_entryMTaskp) || (mtaskp == m_exitMTaskp))) continue;
+
+        // Add the entry/exit edges
+        if (mtaskp->inEmpty()) new MTaskEdge{mtasksp, m_entryMTaskp, mtaskp, 1};
+        if (mtaskp->outEmpty()) new MTaskEdge{mtasksp, mtaskp, m_exitMTaskp, 1};
+    }
+
+    return totalGraphCost;
+}
+
+void V3Partition::go(V3Graph* mtasksp) {
+    // Called by V3Order
+    hashGraphDebug(m_fineDepsGraphp, "v3partition initial fine-grained deps");
+
+    // Create the first MTasks. Initially, each MTask just wraps one
+    // MTaskMoveVertex. Over time, we'll merge MTasks together and
+    // eventually each MTask will wrap a large number of MTaskMoveVertices
+    // (and the logic nodes therein.)
+    const uint32_t totalGraphCost = setupMTaskDeps(mtasksp);
+
+    V3Partition::debugMTaskGraphStats(mtasksp, "initial");
+
+    // For debug: print out the longest critical path. This allows us to
+    // verify that the costs look reasonable, that we aren't combining
+    // nodes that should probably be split, etc.
+    if (dumpLevel() >= 3) LogicMTask::dumpCpFilePrefixed(mtasksp, "cp");
+
+    // Merge nodes that could present data hazards; see comment within.
+    {
+        PartFixDataHazards{m_orderGraphp, mtasksp}.go();
+        V3Partition::debugMTaskGraphStats(mtasksp, "hazards");
+        hashGraphDebug(mtasksp, "mtasksp after fixDataHazards()");
+    }
+
+    // Set up the critical path into and out of each node.
+    partInitCriticalPaths(mtasksp);
+    hashGraphDebug(mtasksp, "after partInitCriticalPaths()");
+
+    // Order the graph. We know it's already ranked from fixDataHazards()
+    // so we don't need to rank it again.
+    //
+    // On at least some models, ordering the graph here seems to help
+    // performance. (Why? Is it just triggering noise in a lucky direction?
+    // Is it just as likely to harm results?)
+    //
+    // More diversity of models that can build with --threads will
+    // eventually tell us. For now keep the order() so we don't forget
+    // about it, in case it actually helps. TODO: get more data and maybe
+    // remove this later if it doesn't really help.
+    mtasksp->orderPreRanked();
+
+    const int targetParFactor = v3Global.opt.threads();
+    UASSERT(targetParFactor >= 2, "Should not reach V3Partition when --threads <= 1");
+
+    // Set cpLimit to roughly totalGraphCost / nThreads
+    //
+    // Actually set it a bit lower, by a hardcoded fudge factor. This
+    // results in more, smaller mtasks, which helps reduce fragmentation
+    // when scheduling them.
+    const unsigned fudgeNumerator = 3;
+    const unsigned fudgeDenominator = 5;
+    const uint32_t cpLimit
+        = ((totalGraphCost * fudgeNumerator) / (targetParFactor * fudgeDenominator));
+    UINFO(4, "V3Partition set cpLimit = " << cpLimit << endl);
+
+    // Merge MTask nodes together, repeatedly, until the CP budget is
+    // reached. Coarsens the graph, usually by several orders of
+    // magnitude.
+    //
+    // Some tests disable this, hence the test on threadsCoarsen().
+    // Coarsening is always enabled in production.
+    if (v3Global.opt.threadsCoarsen()) {
+        PartContraction{mtasksp, cpLimit, m_entryMTaskp, m_exitMTaskp,
+                        // --debugPartition is used by tests
+                        // to enable slow assertions.
+                        v3Global.opt.debugPartition()}
+            .go();
+        V3Partition::debugMTaskGraphStats(mtasksp, "contraction");
+    }
+    {
+        mtasksp->removeTransitiveEdges();
+        V3Partition::debugMTaskGraphStats(mtasksp, "transitive1");
+    }
+
+    // Reassign MTask IDs onto smaller numbers, which should be more stable
+    // across small logic changes. Keep MTask IDs in the same relative
+    // order though, otherwise we break CmpLogicMTask for still-existing
+    // EdgeSets that haven't destructed yet.
+    {
+        using SortedMTaskSet = std::set<LogicMTask*, LogicMTask::CmpLogicMTask>;
+        SortedMTaskSet sorted;
+        for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) {
+            LogicMTask* const mtaskp = static_cast<LogicMTask*>(itp);
+            sorted.insert(mtaskp);
+        }
+        for (auto it = sorted.begin(); it != sorted.end(); ++it) {
+            // We shouldn't perturb the sort order of the set, despite
+            // changing the IDs; they should all just remain in the same
+            // relative order. Confirm that:
+            const uint32_t nextId = v3Global.rootp()->allocNextMTaskID();
+            UASSERT(nextId <= (*it)->id(), "Should only shrink MTaskIDs here");
+            UINFO(4, "Reassigning MTask id " << (*it)->id() << " to id " << nextId << "\n");
+            (*it)->id(nextId);
+        }
+    }
+
+    // Set color to indicate an mtaskId on every underlying MTaskMoveVertex.
+    for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) {
+        const LogicMTask* const mtaskp = static_cast<const LogicMTask*>(itp);
+        for (LogicMTask::VxList::const_iterator it = mtaskp->vertexListp()->begin();
+             it != mtaskp->vertexListp()->end(); ++it) {
+            MTaskMoveVertex* const mvertexp = *it;
+            mvertexp->color(mtaskp->id());
+        }
+    }
+}
+
+void V3Order::selfTestParallel() {
+    UINFO(2, __FUNCTION__ << ": " << endl);
+    PartPropagateCpSelfTest::selfTest();
+    PartContraction::selfTest();
+}
 // Sort MTaskMoveVertex vertices by domain, then by scope, based on the order they are encountered
 class OrderVerticesByDomainThenScope final {
     mutable uint64_t m_nextId = 0;  // Next id to use
diff --git a/src/V3Partition.cpp b/src/V3Partition.cpp
deleted file mode 100644
index effe0b509..000000000
--- a/src/V3Partition.cpp
+++ /dev/null
@@ -1,3210 +0,0 @@
-// -*- mode: C++; c-file-style: "cc-mode" -*-
-//*************************************************************************
-// DESCRIPTION: Verilator: Threading's logic to mtask partitioner
-//
-// Code available from: https://verilator.org
-//
-//*************************************************************************
-//
-// Copyright 2003-2024 by Wilson Snyder. This program is free software; you
-// can redistribute it and/or modify it under the terms of either the GNU
-// Lesser General Public License Version 3 or the Perl Artistic License
-// Version 2.0.
-// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 -// -//************************************************************************* - -#include "V3PchAstNoMT.h" // VL_MT_DISABLED_CODE_UNIT - -#include "V3Partition.h" - -#include "V3Config.h" -#include "V3EmitCBase.h" -#include "V3File.h" -#include "V3GraphStream.h" -#include "V3InstrCount.h" -#include "V3List.h" -#include "V3Os.h" -#include "V3PairingHeap.h" -#include "V3PartitionGraph.h" -#include "V3Scoreboard.h" -#include "V3Stats.h" -#include "V3UniqueNames.h" - -#include -#include -#include -#include -#include -#include -#include - -VL_DEFINE_DEBUG_FUNCTIONS; - -class LogicMTask; -class MTaskEdge; -class MergeCandidate; -class SiblingMC; - -// ###################################################################### -// Partitioner tunable settings: -// -// Before describing these settings, a bit of background: -// -// Early during the development of the partitioner, V3Split was failing to -// split large always blocks (with ~100K assignments) so we had to handle -// very large vertices with ~100K incoming and outgoing edges. -// -// The partitioner attempts to deal with such densely connected -// graphs. Some of the tuning parameters below reference "huge vertices", -// that's what they're talking about, vertices with tens of thousands of -// edges in and out. Whereas most graphs have only tens of edges in and out -// of most vertices. -// -// V3Split has since been fixed to more reliably split large always -// blocks. It's kind of an open question whether the partitioner must -// handle huge nodes gracefully. Maybe not! But it still can, given -// appropriate tuning. - -// PART_SIBLING_EDGE_LIMIT (integer) -// -// Arbitrarily limit the number of edges on a single vertex that will be -// considered when enumerating siblings, to the given value. This protects -// the partitioner runtime in the presence of huge vertices. -// -// The sibling-merge is less important than the edge merge. (You can -// totally disable the sibling merge and get halfway decent partitions; you -// can't disable edge merges, those are fundamental to the process.) So, -// skipping the enumeration of some siblings on a few vertices does not -// have a large impact on the result of the partitioner. -// -// If your vertices are small, the limit (at 26) approaches a no-op. Hence -// there's basically no cost to applying this limit even when we don't -// expect huge vertices. -// -// If you don't care about partitioner runtime and you want the most -// aggressive partition, set the limit very high. If you have huge -// vertices, leave this as is. -constexpr unsigned PART_SIBLING_EDGE_LIMIT = 26; - -// PART_STEPPED_COST (defined/undef) -// -// When computing critical path costs, use a step function on the actual -// underlying vertex cost. -// -// If there are huge vertices, when a tiny vertex merges into a huge -// vertex, we can often avoid increasing the huge vertex's stepped cost. -// If the stepped cost hasn't increased, and the critical path into the huge -// vertex hasn't increased, we can avoid propagating a new critical path to -// vertices past the huge vertex. Since huge vertices tend to have huge lists -// of children and parents, this can be a substantial savings. -// -// Does not seem to reduce the quality of the partitioner's output. -// -// If you have huge vertices, leave this 'true', it is the major setting -// that allows the partitioner to handle such difficult graphs on anything -// like a human time scale. 
-//
-// If you don't have huge vertices, the 'true' value doesn't help much but
-// should cost almost nothing in terms of partitioner quality.
-//
-// If you want the most aggressive possible partition, set it "false" and
-// be prepared to be disappointed when the improvement in the partition is
-// negligible / in the noise.
-//
-// Q) Why retain the control, if there is really no downside?
-//
-// A) Cost stepping can lead to corner cases. A developer may wish to
-// disable cost stepping to rule it out as the cause of unexpected
-// behavior.
-#define PART_STEPPED_COST true
-
-// Don't produce more than a certain maximum number of MTasks. This helps
-// the TSP variable sort not to blow up (a concern for some of the tests)
-// and we probably don't want a huge number of mtasks in practice anyway
-// (50 to 100 is typical.)
-//
-// If the user doesn't give one with '--threads-max-mtasks', we'll set the
-// maximum # of MTasks to
-// (# of threads * PART_DEFAULT_MAX_MTASKS_PER_THREAD)
-constexpr unsigned PART_DEFAULT_MAX_MTASKS_PER_THREAD = 50;
-
-// end tunables.
-
-//######################################################################
-// Misc graph and assertion utilities
-
-static void partCheckCachedScoreVsActual(uint32_t cached, uint32_t actual) {
-#if PART_STEPPED_COST
-    // Cached CP might be a little bigger than actual, due to stepped CPs.
-    // Example:
-    // Let's say we have a parent with stepped_cost 40 and a grandparent
-    // with stepped_cost 27. Our forward-cp is 67. Then our parent and
-    // grandparent get merged, the merged node has stepped cost 66. We
-    // won't propagate that new CP to children as it hasn't grown. So,
-    // children may continue to think that the CP coming through this path
-    // is a little higher than it really is; permit that.
-    UASSERT((((cached * 10) <= (actual * 11)) && (cached * 11) >= (actual * 10)),
-            "Calculation error in scoring (approximate, may need tweak)");
-#else
-    UASSERT(cached == actual, "Calculation error in scoring");
-#endif
-}
-
-//=============================================================================
-// We keep MTaskEdge graph edges in a PairingHeap, sorted by score and id
-
-struct EdgeKey final {
-    // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node
-    uint64_t m_id;  // Unique ID part of edge score
-    uint32_t m_score;  // Score part of ID
-    void increase(uint32_t score) {
-#if VL_DEBUG
-        UASSERT(score >= m_score, "Must increase");
-#endif
-        m_score = score;
-    }
-    bool operator<(const EdgeKey& other) const {
-        // First by Score then by ID
-        return m_score < other.m_score || (m_score == other.m_score && m_id < other.m_id);
-    }
-};
-
-using EdgeHeap = PairingHeap<EdgeKey>;
-
-//=============================================================================
-// LogicMTask
-
-class LogicMTask final : public AbstractLogicMTask {
-    VL_RTTI_IMPL(LogicMTask, AbstractLogicMTask)
-    template <GraphWay::en T_Way>
-    friend class PartPropagateCp;
-
-public:
-    // TYPES
-    using VxList = std::list<MTaskMoveVertex*>;
-
-    struct CmpLogicMTask final {
-        bool operator()(const LogicMTask* ap, const LogicMTask* bp) const {
-            return ap->id() < bp->id();
-        }
-    };
-
-private:
-    // MEMBERS
-
-    // Set of MTaskMoveVertex's assigned to this mtask. LogicMTask does not
-    // own the MTaskMoveVertex objects, we merely keep pointers to them
-    // here.
-    VxList m_mvertices;
-
-    // Cost estimate for this LogicMTask, derived from V3InstrCount.
-    // In abstract time units.
- uint32_t m_cost = 0; - - // Cost of critical paths going FORWARD from graph-start to the start - // of this vertex, and also going REVERSE from the end of the graph to - // the end of the vertex. Same units as m_cost. - std::array m_critPathCost; - - uint32_t m_serialId; // Unique MTask ID number - - // Count "generations" which are just operations that scan through the - // graph. We'll mark each node with the last generation that scanned - // it. We can use this to avoid recursing through the same node twice - // while searching for a path. - uint64_t m_generation = 0; - - // Store a set of forward relatives so we can quickly check if we have a given child - std::unordered_set m_edgeSet; - // Store the outgoing and incoming edges in a heap sorted by the critical path length - std::array m_edgeHeap; - - // MTasks for which a SiblingMC exists with 'this' as the higher ID MTask (m_ap in SiblingMC) - std::set m_siblings; - // List of SiblingMCs for which this is the higher ID MTask (m_ap in SiblingMC) - V3List m_aSiblingMCs; - // List of SiblingMCs for which this is the lower ID MTask (m_bp in SiblingMC) - V3List m_bSiblingMCs; - -public: - // CONSTRUCTORS - LogicMTask(V3Graph* graphp, MTaskMoveVertex* mtmvVxp) - : AbstractLogicMTask{graphp} { - for (uint32_t& item : m_critPathCost) item = 0; - if (mtmvVxp) { // Else null for test - m_mvertices.push_back(mtmvVxp); - if (const OrderLogicVertex* const olvp = mtmvVxp->logicp()) { - m_cost += V3InstrCount::count(olvp->nodep(), true); - } - } - // Start at 1, so that 0 indicates no mtask ID. - static uint32_t s_nextId = 1; - m_serialId = s_nextId++; - UASSERT(s_nextId < 0xFFFFFFFFUL, "Too many mtasks"); - } - - // METHODS - std::set& siblings() { return m_siblings; }; - V3List& aSiblingMCs() { return m_aSiblingMCs; }; - V3List& bSiblingMCs() { return m_bSiblingMCs; }; - - void moveAllVerticesFrom(LogicMTask* otherp) { - // splice() is constant time - m_mvertices.splice(m_mvertices.end(), otherp->m_mvertices); - m_cost += otherp->m_cost; - } - const VxList* vertexListp() const override { return &m_mvertices; } - static uint64_t incGeneration() { - static uint64_t s_generation = 0; - ++s_generation; - return s_generation; - } - - // Use this instead of pointer-compares to compare LogicMTasks. Avoids - // nondeterministic output. Also name mtasks based on this number in - // the final C++ output. - uint32_t id() const override { return m_serialId; } - void id(uint32_t id) { m_serialId = id; } - // Abstract cost of every logic mtask - uint32_t cost() const override VL_MT_SAFE { return m_cost; } - void setCost(uint32_t cost) { m_cost = cost; } // For tests only - uint32_t stepCost() const { return stepCost(m_cost); } - static uint32_t stepCost(uint32_t cost) { -#if PART_STEPPED_COST - // Round cost up to the nearest 5%. Use this when computing all - // critical paths. The idea is that critical path changes don't - // need to propagate when they don't exceed the next step, saving a - // lot of recursion. 
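The 5% rounding just described is easiest to see with numbers. A self-contained sketch of the same arithmetic (mirroring the stepCost() body that follows; illustrative only, not part of the patch):

// Standalone illustration (not Verilator code) of stepped cost: round a
// cost up to the next 5% boundary in log space, so small increases
// usually land on the same step and need no critical-path propagation.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iostream>

static uint32_t steppedCost(uint32_t cost) {
    if (cost == 0) return 0;
    // ceil(log(cost) * 20) / 20 rounds log(cost) up to a 0.05 boundary;
    // log(1.05) is about 0.05, so exp() of that is at most ~5% above cost.
    const double logcost = std::ceil(std::log(cost) * 20.0) / 20.0;
    return static_cast<uint32_t>(std::exp(logcost));
}

int main() {
    // 1000, 1010 and 1040 all round up to the same step (1043), so a merge
    // that grows a vertex from 1000 to 1040 leaves its stepped cost alone
    // and no new critical path needs to be propagated past it.
    std::cout << steppedCost(1000) << " " << steppedCost(1010) << " "
              << steppedCost(1040) << "\n";
    assert(steppedCost(1010) == steppedCost(1000));
    assert(steppedCost(1040) == steppedCost(1000));
    return 0;
}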
- if (cost == 0) return 0; - - double logcost = log(cost); - // log(1.05) is about 0.05 - // So, round logcost up to the next 0.05 boundary - logcost *= 20.0; - logcost = ceil(logcost); - logcost = logcost / 20.0; - - const uint32_t stepCost = static_cast(exp(logcost)); -#if VL_DEBUG - UASSERT_STATIC(stepCost >= cost, "stepped cost error exceeded"); - UASSERT_STATIC(stepCost <= ((cost * 11 / 10)), "stepped cost error exceeded"); -#endif - return stepCost; -#else - return cost; -#endif - } - - template - void addRelativeEdge(MTaskEdge* edgep); - template - void stealRelativeEdge(MTaskEdge* edgep); - template - void removeRelativeEdge(MTaskEdge* edgep); - - void addRelativeMTask(LogicMTask* relativep) { - // Add the relative to connecting edge map - VL_ATTR_UNUSED const bool exits = !m_edgeSet.emplace(relativep).second; -#if VL_DEBUG - UASSERT(!exits, "Adding existing relative"); -#endif - } - void removeRelativeMTask(LogicMTask* relativep) { - VL_ATTR_UNUSED const size_t removed = m_edgeSet.erase(relativep); -#if VL_DEBUG - UASSERT(removed, "Relative should have been in set"); -#endif - } - bool hasRelativeMTask(LogicMTask* relativep) const { return m_edgeSet.count(relativep); } - - void checkRelativesCp(GraphWay way) const; - - string name() const override VL_MT_STABLE { - // Display forward and reverse critical path costs. This gives a quick - // read on whether graph partitioning looks reasonable or bad. - std::ostringstream out; - out << "mt" << m_serialId << "." << this << " [b" << m_critPathCost[GraphWay::FORWARD] - << " a" << m_critPathCost[GraphWay::REVERSE] << " c" << cost(); - return out.str(); - } - - void setCritPathCost(GraphWay way, uint32_t cost) { m_critPathCost[way] = cost; } - uint32_t critPathCost(GraphWay way) const { return m_critPathCost[way]; } - uint32_t critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const; - -private: - static bool pathExistsFromInternal(LogicMTask* fromp, LogicMTask* top, - const V3GraphEdge* excludedEdgep, uint64_t generation) { - // Q) Why does this take LogicMTask instead of generic V3GraphVertex? - // A) We'll use the critical paths known to LogicMTask to prune the - // recursion for speed. Also store 'generation' in - // LogicMTask::m_generation so we can prune the search and avoid - // recursing through the same node more than once in a single - // search. - - if (fromp->m_generation == generation) { - // Already looked at this node in the current search. - // Since we're back again, we must not have found a path on the - // first go. - return false; - } - fromp->m_generation = generation; - - // Base case: we found a path. - if (fromp == top) return true; - - // Base case: fromp is too late, cannot possibly be a prereq for top. - if (fromp->critPathCost(GraphWay::REVERSE) - < (top->critPathCost(GraphWay::REVERSE) + top->stepCost())) { - return false; - } - if ((fromp->critPathCost(GraphWay::FORWARD) + fromp->stepCost()) - > top->critPathCost(GraphWay::FORWARD)) { - return false; - } - - // Recursively look for a path - for (const V3GraphEdge* followp = fromp->outBeginp(); followp; - followp = followp->outNextp()) { - if (followp == excludedEdgep) continue; - LogicMTask* const nextp = static_cast(followp->top()); - if (pathExistsFromInternal(nextp, top, nullptr, generation)) return true; - } - return false; - } - - // True if there's a path from 'fromp' to 'top' excluding - // 'excludedEdgep', false otherwise. - // - // 'excludedEdgep' may be nullptr in which case no edge is excluded. 
If
-    // 'excludedEdgep' is non-nullptr it must connect fromp and top.
-    //
-    // TODO: consider changing this API to the 'isTransitiveEdge' API
-    // used by GraphPathChecker
-public:
-    static bool pathExistsFrom(LogicMTask* fromp, LogicMTask* top,
-                               const V3GraphEdge* excludedEdgep) {
-        return pathExistsFromInternal(fromp, top, excludedEdgep, incGeneration());
-    }
-
-    static void dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment);
-
-private:
-    VL_UNCOPYABLE(LogicMTask);
-};
-
-//######################################################################
-// MTask utility classes
-
-// Sort AbstractMTask objects into deterministic order by calling id()
-// which is a unique and stable serial number.
-struct MTaskIdLessThan final {
-    bool operator()(const AbstractMTask* lhsp, const AbstractMTask* rhsp) const {
-        return lhsp->id() < rhsp->id();
-    }
-};
-
-struct MergeCandidateKey final {
-    // Note: Structure layout chosen to minimize padding in PairingHeap<*>::Node
-    uint64_t m_id;  // Unique ID part of edge score
-    uint32_t m_score;  // Score part of ID
-    bool operator<(const MergeCandidateKey& other) const {
-        // First by Score then by ID, but notice that we want minimums using a max-heap, so reverse
-        return m_score > other.m_score || (m_score == other.m_score && m_id > other.m_id);
-    }
-};
-
-using MergeCandidateScoreboard = V3Scoreboard<MergeCandidate, MergeCandidateKey>;
-
-// Information associated with scoreboarding a merge candidate
-class MergeCandidate VL_NOT_FINAL : public MergeCandidateScoreboard::Node {
-    // Only the known subclasses can create or delete one of these
-    friend class SiblingMC;
-    friend class MTaskEdge;
-
-    // This structure is extremely hot. To save 8 bytes we pack
-    // one bit indicating removedFromSb with the id. To save another
-    // 8 bytes by not having a virtual function table, we implement the
-    // few polymorphic methods over the two known subclasses explicitly,
-    // using another bit of the id to denote the actual subtype.
-
-    // By using the bottom bits for flags, we can still use < to compare IDs without masking.
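The packing scheme is small enough to demonstrate in isolation. A hypothetical standalone sketch (the constants mirror the ones declared just below; plain C++, not part of the patch):

// Standalone illustration (not Verilator code) of packing a subtype
// flag into the low bit of a 64-bit ordering key.
#include <cassert>
#include <cstdint>

static constexpr uint64_t IS_SIBLING_MASK = 1ULL << 0;  // Bit 0: subtype flag
static constexpr uint64_t ID_INCREMENT = 1ULL << 1;     // Serials live above bit 0

int main() {
    uint64_t serial = 0;
    serial += ID_INCREMENT;
    const uint64_t edgeId = serial;                   // An edge-style candidate
    serial += ID_INCREMENT;
    const uint64_t sibId = serial | IS_SIBLING_MASK;  // A sibling-style candidate
    assert(!(edgeId & IS_SIBLING_MASK));  // Subtype is recoverable from bit 0
    assert(sibId & IS_SIBLING_MASK);
    // Plain < still orders candidates by allocation order, no masking
    // needed, because distinct IDs always differ in bits above the flag.
    assert(edgeId < sibId);
    return 0;
}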
- // <63:1> Serial number for ordering, <0> subtype (SiblingMC) - static constexpr uint64_t IS_SIBLING_MASK = 1ULL << 0; - static constexpr uint64_t ID_INCREMENT = 1ULL << 1; - - bool isSiblingMC() const { return m_key.m_id & IS_SIBLING_MASK; } - - // CONSTRUCTORS - explicit MergeCandidate(bool isSiblingMC) { - static uint64_t serial = 0; - serial += ID_INCREMENT; // +ID_INCREMENT so doesn't set the special bottom bits - m_key.m_id = serial | (isSiblingMC * IS_SIBLING_MASK); - } - ~MergeCandidate() = default; - -public: - // METHODS - SiblingMC* toSiblingMC(); // Instead of cast<>/as<> - const SiblingMC* toSiblingMC() const; // Instead of cast<>/as<> - MTaskEdge* toMTaskEdge(); // Instead of cast<>/as<> - const MTaskEdge* toMTaskEdge() const; // Instead of cast<>/as<> - bool mergeWouldCreateCycle() const; // Instead of virtual method - - inline void rescore(); - uint32_t score() const { return m_key.m_score; } - - static MergeCandidate* heapNodeToElem(MergeCandidateScoreboard::Node* nodep) { - return static_cast(nodep); - } -}; - -static_assert(sizeof(MergeCandidate) == sizeof(MergeCandidateScoreboard::Node), - "Should not have a vtable"); - -// A pair of associated LogicMTask's that are merge candidates for sibling -// contraction -class SiblingMC final : public MergeCandidate { - LogicMTask* const m_ap; - LogicMTask* const m_bp; - - V3ListEnt m_aEnt; // List entry for m_ap->aSiblingMCs() - V3ListEnt m_bEnt; // List entry for m_bp->bSiblingMCs() - -public: - // CONSTRUCTORS - SiblingMC() = delete; - SiblingMC(LogicMTask* ap, LogicMTask* bp) - : MergeCandidate{/* isSiblingMC: */ true} - , m_ap{ap} - , m_bp{bp} { - // Storage management depends on this - UASSERT(ap->id() > bp->id(), "Should be ordered"); - UDEBUGONLY(UASSERT(ap->siblings().count(bp), "Should be in sibling map");); - m_aEnt.pushBack(m_ap->aSiblingMCs(), this); - m_bEnt.pushBack(m_bp->bSiblingMCs(), this); - } - ~SiblingMC() = default; - - // METHODS - SiblingMC* aNextp() const { return m_aEnt.nextp(); } - SiblingMC* bNextp() const { return m_bEnt.nextp(); } - void unlinkA() { - VL_ATTR_UNUSED const size_t removed = m_ap->siblings().erase(m_bp); - UDEBUGONLY(UASSERT(removed == 1, "Should have been in sibling set");); - m_aEnt.unlink(m_ap->aSiblingMCs(), this); - } - void unlinkB() { m_bEnt.unlink(m_bp->bSiblingMCs(), this); } - - LogicMTask* ap() const { return m_ap; } - LogicMTask* bp() const { return m_bp; } - bool mergeWouldCreateCycle() const { - return (LogicMTask::pathExistsFrom(m_ap, m_bp, nullptr) - || LogicMTask::pathExistsFrom(m_bp, m_ap, nullptr)); - } -}; - -static_assert(!std::is_polymorphic::value, "Should not have a vtable"); - -// GraphEdge for the MTask graph -class MTaskEdge final : public V3GraphEdge, public MergeCandidate { - VL_RTTI_IMPL(MTaskEdge, V3GraphEdge) - friend class LogicMTask; - template - friend class PartPropagateCp; - - // MEMBERS - // This edge can be in 2 EdgeHeaps, one forward and one reverse. We allocate the heap nodes - // directly within the edge as they are always required and this makes association cheap. 
- std::array m_edgeHeapNode; - -public: - // CONSTRUCTORS - MTaskEdge(V3Graph* graphp, LogicMTask* fromp, LogicMTask* top, int weight) - : V3GraphEdge{graphp, fromp, top, weight} - , MergeCandidate{/* isSiblingMC: */ false} { - fromp->addRelativeMTask(top); - fromp->addRelativeEdge(this); - top->addRelativeEdge(this); - } - // METHODS - LogicMTask* furtherMTaskp(GraphWay way) const { - return static_cast(this->furtherp(way)); - } - LogicMTask* fromMTaskp() const { return static_cast(fromp()); } - LogicMTask* toMTaskp() const { return static_cast(top()); } - bool mergeWouldCreateCycle() const { - return LogicMTask::pathExistsFrom(fromMTaskp(), toMTaskp(), this); - } - // Following initial assignment of critical paths, clear this MTaskEdge - // out of the edge-map for each node and reinsert at a new location - // with updated critical path. - void resetCriticalPaths() { - LogicMTask* const fromp = fromMTaskp(); - LogicMTask* const top = toMTaskp(); - fromp->removeRelativeEdge(this); - top->removeRelativeEdge(this); - fromp->addRelativeEdge(this); - top->addRelativeEdge(this); - } - - uint32_t cachedCp(GraphWay way) const { return m_edgeHeapNode[way].key().m_score; } - - // Convert from the address of the m_edgeHeapNode[way] in an MTaskEdge back to the MTaskEdge - static const MTaskEdge* toMTaskEdge(GraphWay way, const EdgeHeap::Node* nodep) { - const size_t offset = VL_OFFSETOF(MTaskEdge, m_edgeHeapNode[way]); - return reinterpret_cast(reinterpret_cast(nodep) - offset); - } - -private: - VL_UNCOPYABLE(MTaskEdge); -}; - -template -void LogicMTask::addRelativeEdge(MTaskEdge* edgep) { - constexpr GraphWay way{T_Way}; - constexpr GraphWay inv = way.invert(); - // Add to the edge heap - LogicMTask* const relativep = edgep->furtherMTaskp(way); - // Value is !way cp to this edge - const uint32_t cp = relativep->stepCost() + relativep->critPathCost(inv); - // - m_edgeHeap[way].insert(&edgep->m_edgeHeapNode[way], {relativep->id(), cp}); -} - -template -void LogicMTask::stealRelativeEdge(MTaskEdge* edgep) { - constexpr GraphWay way{T_Way}; - // Make heap node insertable, ruining the heap it is currently in. - edgep->m_edgeHeapNode[way].yank(); - // Add the edge as new - addRelativeEdge(edgep); -} - -template -void LogicMTask::removeRelativeEdge(MTaskEdge* edgep) { - constexpr GraphWay way{T_Way}; - // Remove from the edge heap - m_edgeHeap[way].remove(&edgep->m_edgeHeapNode[way]); -} - -void LogicMTask::checkRelativesCp(GraphWay way) const { - for (V3GraphEdge* edgep = beginp(way); edgep; edgep = edgep->nextp(way)) { - const LogicMTask* const relativep = static_cast(edgep->furtherp(way)); - const uint32_t cachedCp = static_cast(edgep)->cachedCp(way); - const uint32_t cp = relativep->critPathCost(way.invert()) + relativep->stepCost(); - partCheckCachedScoreVsActual(cachedCp, cp); - } -} - -uint32_t LogicMTask::critPathCostWithout(GraphWay way, const V3GraphEdge* withoutp) const { - // Compute the critical path cost wayward to this node, without considering edge 'withoutp'. - // We need to look at two edges at most, the critical path if that is not via 'withoutp', - // or the second-worst path, if the critical path is via 'withoutp'. 
-#if VL_DEBUG - UASSERT(withoutp->furtherp(way) == this, - "In critPathCostWithout(), edge 'withoutp' must further to 'this'"); -#endif - const GraphWay inv = way.invert(); - const EdgeHeap& edgeHeap = m_edgeHeap[inv]; - const EdgeHeap::Node* const maxp = edgeHeap.max(); - if (!maxp) return 0; - if (MTaskEdge::toMTaskEdge(inv, maxp) != withoutp) return maxp->key().m_score; - const EdgeHeap::Node* const secp = edgeHeap.secondMax(); - if (!secp) return 0; - return secp->key().m_score; -} - -void LogicMTask::dumpCpFilePrefixed(const V3Graph* graphp, const string& nameComment) { - const string filename = v3Global.debugFilename(nameComment) + ".txt"; - UINFO(1, "Writing " << filename << endl); - const std::unique_ptr ofp{V3File::new_ofstream(filename)}; - std::ostream* const osp = &(*ofp); // &* needed to deref unique_ptr - if (osp->fail()) v3fatalStatic("Can't write " << filename); - - // Find start vertex with longest CP - LogicMTask* startp = nullptr; - for (V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { - LogicMTask* const mtaskp = static_cast(vxp); - if (!startp) { - startp = mtaskp; - continue; - } - if (mtaskp->cost() + mtaskp->critPathCost(GraphWay::REVERSE) - > startp->cost() + startp->critPathCost(GraphWay::REVERSE)) { - startp = mtaskp; - } - } - - // Follow the entire critical path - std::vector path; - uint32_t totalCost = 0; - for (LogicMTask* nextp = startp; nextp;) { - path.push_back(nextp); - totalCost += nextp->cost(); - - if (EdgeHeap::Node* const maxp = nextp->m_edgeHeap[GraphWay::FORWARD].max()) { - nextp = MTaskEdge::toMTaskEdge(GraphWay::FORWARD, maxp)->toMTaskp(); - } else { - nextp = nullptr; - } - } - - *osp << "totalCost = " << totalCost - << " (should match the computed critical path cost (CP) for the graph)\n"; - - // Dump - for (const LogicMTask* mtaskp : path) { - *osp << "begin mtask with cost " << mtaskp->cost() << '\n'; - for (VxList::const_iterator lit = mtaskp->vertexListp()->begin(); - lit != mtaskp->vertexListp()->end(); ++lit) { - const OrderLogicVertex* const logicp = (*lit)->logicp(); - if (!logicp) continue; - if (false) { - // Show nodes only - *osp << "> "; - logicp->nodep()->dumpTree(*osp); - } else { - // Show nodes with hierarchical costs - V3InstrCount::count(logicp->nodep(), false, osp); - } - } - } -} - -// Instead of dynamic cast -SiblingMC* MergeCandidate::toSiblingMC() { - return isSiblingMC() ? static_cast(this) : nullptr; -} - -MTaskEdge* MergeCandidate::toMTaskEdge() { - return isSiblingMC() ? nullptr : static_cast(this); -} - -const SiblingMC* MergeCandidate::toSiblingMC() const { - return isSiblingMC() ? static_cast(this) : nullptr; -} - -const MTaskEdge* MergeCandidate::toMTaskEdge() const { - return isSiblingMC() ? nullptr : static_cast(this); -} - -// Normally this would be a virtual function, but we save space by not having a vtable, -// and we know we only have 2 possible subclasses. -bool MergeCandidate::mergeWouldCreateCycle() const { - return isSiblingMC() ? 
static_cast(this)->mergeWouldCreateCycle() - : static_cast(this)->mergeWouldCreateCycle(); -} - -static uint32_t siblingScore(const SiblingMC* sibsp) { - const LogicMTask* const ap = sibsp->ap(); - const LogicMTask* const bp = sibsp->bp(); - const uint32_t mergedCpCostFwd - = std::max(ap->critPathCost(GraphWay::FORWARD), bp->critPathCost(GraphWay::FORWARD)); - const uint32_t mergedCpCostRev - = std::max(ap->critPathCost(GraphWay::REVERSE), bp->critPathCost(GraphWay::REVERSE)); - return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(ap->cost() + bp->cost()); -} - -static uint32_t edgeScore(const MTaskEdge* edgep) { - // Score this edge. Lower is better. The score is the new local CP - // length if we merge these mtasks. ("Local" means the longest - // critical path running through the merged node.) - const LogicMTask* const top = static_cast(edgep->top()); - const LogicMTask* const fromp = static_cast(edgep->fromp()); - const uint32_t mergedCpCostFwd = std::max(fromp->critPathCost(GraphWay::FORWARD), - top->critPathCostWithout(GraphWay::FORWARD, edgep)); - const uint32_t mergedCpCostRev = std::max(fromp->critPathCostWithout(GraphWay::REVERSE, edgep), - top->critPathCost(GraphWay::REVERSE)); - return mergedCpCostRev + mergedCpCostFwd + LogicMTask::stepCost(fromp->cost() + top->cost()); -} - -void MergeCandidate::rescore() { - if (const SiblingMC* const sibp = toSiblingMC()) { - m_key.m_score = siblingScore(sibp); - } else { - // The '1 +' favors merging a SiblingMC over an otherwise- - // equal-scoring MTaskEdge. The comment on selfTest() talks - // about why. - m_key.m_score = 1 + edgeScore(static_cast(this)); - } -} - -//###################################################################### - -// Look at vertex costs (in one way) to form critical paths for each -// vertex. -static void partInitHalfCriticalPaths(GraphWay way, V3Graph* mtasksp, bool checkOnly) { - GraphStreamUnordered order(mtasksp, way); - const GraphWay rev = way.invert(); - for (const V3GraphVertex* vertexp; (vertexp = order.nextp());) { - const LogicMTask* const mtaskcp = static_cast(vertexp); - LogicMTask* const mtaskp = const_cast(mtaskcp); - uint32_t cpCost = 0; -#if VL_DEBUG - std::unordered_set relatives; -#endif - for (V3GraphEdge* edgep = vertexp->beginp(rev); edgep; edgep = edgep->nextp(rev)) { -#if VL_DEBUG - // Run a few asserts on the initial mtask graph, - // while we're iterating through... - UASSERT_OBJ(edgep->weight() != 0, mtaskp, "Should be no cut edges in mtasks graph"); - UASSERT_OBJ(relatives.find(edgep->furtherp(rev)) == relatives.end(), mtaskp, - "Should be no redundant edges in mtasks graph"); - relatives.insert(edgep->furtherp(rev)); -#endif - const LogicMTask* const relativep = static_cast(edgep->furtherp(rev)); - cpCost = std::max(cpCost, (relativep->critPathCost(way) - + static_cast(relativep->stepCost()))); - } - if (checkOnly) { - partCheckCachedScoreVsActual(mtaskp->critPathCost(way), cpCost); - } else { - mtaskp->setCritPathCost(way, cpCost); - } - } -} - -// Look at vertex costs to form critical paths for each vertex. -static void partInitCriticalPaths(V3Graph* mtasksp) { - partInitHalfCriticalPaths(GraphWay::FORWARD, mtasksp, false); - partInitHalfCriticalPaths(GraphWay::REVERSE, mtasksp, false); - - // Reset all MTaskEdges so that 'm_edges' will show correct CP numbers. - // They would have been all zeroes on initial creation of the MTaskEdges. 
- for (V3GraphVertex* vxp = mtasksp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { - for (V3GraphEdge* edgep = vxp->outBeginp(); edgep; edgep = edgep->outNextp()) { - MTaskEdge* const mtedgep = edgep->as(); - mtedgep->resetCriticalPaths(); - } - } -} - -// Do an EXPENSIVE check to make sure that all incremental CP updates have -// gone correctly. -static void partCheckCriticalPaths(V3Graph* mtasksp) { - partInitHalfCriticalPaths(GraphWay::FORWARD, mtasksp, true); - partInitHalfCriticalPaths(GraphWay::REVERSE, mtasksp, true); - for (V3GraphVertex* vxp = mtasksp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) { - const LogicMTask* const mtaskp = static_cast(vxp); - mtaskp->checkRelativesCp(GraphWay::FORWARD); - mtaskp->checkRelativesCp(GraphWay::REVERSE); - } -} - -// ###################################################################### -// PartPropagateCp - -// Propagate increasing critical path (CP) costs through a graph. -// -// Usage: -// * Client increases the cost and/or CP at a node or small set of nodes -// (often a pair in practice, eg. edge contraction.) -// * Client calls PartPropagateCp::cpHasIncreased() one or more times. -// Each call indicates that the inclusive CP of some "seed" vertex -// has increased to a given value. -// * NOTE: PartPropagateCp will neither read nor modify the cost -// or CPs at the seed vertices, it only accesses and modifies -// vertices wayward from the seeds. -// * Client calls PartPropagateCp::go(). Internally, this iteratively -// propagates the new CPs wayward through the graph. -// -template -class PartPropagateCp final { - // TYPES - - // We keep pending vertices in a heap during critical path propagation - struct PendingKey final { - LogicMTask* m_mtaskp; // The vertex in the heap - uint32_t m_score; // The score of this entry - void increase(uint32_t score) { -#if VL_DEBUG - UASSERT(score >= m_score, "Must increase"); -#endif - m_score = score; - } - bool operator<(const PendingKey& other) const { - if (m_score != other.m_score) return m_score < other.m_score; - return LogicMTask::CmpLogicMTask{}(m_mtaskp, other.m_mtaskp); - } - }; - - using PendingHeap = PairingHeap; - using PendingHeapNode = typename PendingHeap::Node; - - // MEMBERS - PendingHeap m_pendingHeap; // Heap of pending rescores - - // We allocate this many heap nodes at once - static constexpr size_t ALLOC_CHUNK_SIZE = 128; - PendingHeapNode* m_freep = nullptr; // List of free heap nodes - std::vector> m_allocated; // Allocated heap nodes - - const bool m_slowAsserts; // Enable nontrivial asserts - std::set m_seen; // Used only with slow asserts to check mtasks visited only once - -public: - // CONSTRUCTORS - explicit PartPropagateCp(bool slowAsserts) - : m_slowAsserts{slowAsserts} {} - - // METHODS -private: - // Allocate a HeapNode for the given element - PendingHeapNode* allocNode() { - // If no free nodes available, then make some - if (!m_freep) { - // Allocate in chunks for efficiency - m_allocated.emplace_back(new PendingHeapNode[ALLOC_CHUNK_SIZE]); - // Set up free list pointer - m_freep = m_allocated.back().get(); - // Set up free list chain - for (size_t i = 1; i < ALLOC_CHUNK_SIZE; ++i) { - m_freep[i - 1].m_next.m_ptr = &m_freep[i]; - } - // Clear the next pointer of the last entry - m_freep[ALLOC_CHUNK_SIZE - 1].m_next.m_ptr = nullptr; - } - // Free nodes are available, pick up the first one - PendingHeapNode* const resultp = m_freep; - m_freep = resultp->m_next.m_ptr; - resultp->m_next.m_ptr = nullptr; - return resultp; - } - - // Release a heap node (make 
it available for future allocation) - void freeNode(PendingHeapNode* nodep) { - // Re-use the existing link pointers and simply prepend it to the free list - nodep->m_next.m_ptr = m_freep; - m_freep = nodep; - } - -public: - void cpHasIncreased(V3GraphVertex* vxp, uint32_t newInclusiveCp) { - constexpr GraphWay way{T_Way}; - constexpr GraphWay inv{way.invert()}; - - // For *vxp, whose CP-inclusive has just increased to - // newInclusiveCp, iterate to all wayward nodes, update the edges - // of each, and add each to m_pending if its overall CP has grown. - for (MTaskEdge *edgep = static_cast(vxp->beginp(way)), *nextp; edgep; - edgep = nextp) { - // Fetch early as likely cache miss - nextp = static_cast(edgep->nextp(way)); - - LogicMTask* const relativep = edgep->furtherMTaskp(way); - EdgeHeap::Node& edgeHeapNode = edgep->m_edgeHeapNode[inv]; - if (newInclusiveCp > edgeHeapNode.key().m_score) { - relativep->m_edgeHeap[inv].increaseKey(&edgeHeapNode, newInclusiveCp); - } - - const uint32_t critPathCost = relativep->critPathCost(way); - - if (critPathCost >= newInclusiveCp) continue; - - // relativep's critPathCost() is out of step with its longest !wayward edge. - // Schedule that to be resolved. - const uint32_t newVal = newInclusiveCp - critPathCost; - - if (PendingHeapNode* const nodep = static_cast(relativep->userp())) { - // Already in heap. Increase score if needed. - if (newVal > nodep->key().m_score) m_pendingHeap.increaseKey(nodep, newVal); - continue; - } - - // Add to heap - PendingHeapNode* const nodep = allocNode(); - relativep->userp(nodep); - m_pendingHeap.insert(nodep, {relativep, newVal}); - } - } - - void go() { - constexpr GraphWay way{T_Way}; - constexpr GraphWay inv{way.invert()}; - - // m_pending maps each pending vertex to the amount that it wayward - // CP will grow. - // - // We can iterate over the pending set in reverse order, always - // choosing the nodes with the largest pending CP-growth. - // - // The intuition is: if the original seed node had its CP grow by - // 50, the most any wayward node can possibly grow is also 50. So - // for anything pending to grow by 50, we know we can process it - // once and we won't have to grow its CP again on the current pass. - // After we're done with all the grow-by-50s, nothing else will - // grow by 50 again on the current pass, and we can process the - // grow-by-49s and we know we'll only have to process each one - // once. And so on. - // - // This generalizes to multiple seed nodes also. - while (!m_pendingHeap.empty()) { - // Pop max element from heap - PendingHeapNode* const maxp = m_pendingHeap.max(); - m_pendingHeap.remove(maxp); - // Pick up values - LogicMTask* const mtaskp = maxp->key().m_mtaskp; - const uint32_t cpGrowBy = maxp->key().m_score; - // Free the heap node, we are done with it - freeNode(maxp); - mtaskp->userp(nullptr); - // Update the critPathCost of mtaskp, that was out-of-date with respect to its edges - const uint32_t startCp = mtaskp->critPathCost(way); - const uint32_t newCp = startCp + cpGrowBy; - if (VL_UNLIKELY(m_slowAsserts)) { - // Check that CP matches that of the longest edge wayward of vxp. - const uint32_t edgeCp = mtaskp->m_edgeHeap[inv].max()->key().m_score; - UASSERT_OBJ(edgeCp == newCp, mtaskp, "CP doesn't match longest wayward edge"); - // Confirm that we only set each node's CP once. That's an - // important property of PartPropagateCp which allows it to be far - // faster than a recursive algorithm on some graphs. 
- const bool first = m_seen.insert(mtaskp).second; - UASSERT_OBJ(first, mtaskp, "Set CP on node twice"); - } - mtaskp->setCritPathCost(way, newCp); - cpHasIncreased(mtaskp, newCp + mtaskp->stepCost()); - } - - if (VL_UNLIKELY(m_slowAsserts)) m_seen.clear(); - } - -private: - VL_UNCOPYABLE(PartPropagateCp); -}; - -class PartPropagateCpSelfTest final { - // MEMBERS - V3Graph m_graph; // A graph - std::array m_vx; // All vertices within the graph - - // CONSTRUCTORS - PartPropagateCpSelfTest() = default; - ~PartPropagateCpSelfTest() = default; - - void go() { - // Generate a pseudo-random graph - std::array rngState - = {{0x12345678ULL, 0x9abcdef0ULL}}; // GCC 3.8.0 wants {{}} - // Create 50 vertices - for (auto& i : m_vx) { - i = new LogicMTask{&m_graph, nullptr}; - i->setCost(1); - } - // Create 250 edges at random. Edges must go from - // lower-to-higher index vertices, so we get a DAG. - for (unsigned i = 0; i < 250; ++i) { - const unsigned idx1 = V3Os::rand64(rngState) % 50; - const unsigned idx2 = V3Os::rand64(rngState) % 50; - if (idx1 > idx2) { - if (!m_vx[idx2]->hasRelativeMTask(m_vx[idx1])) { - new MTaskEdge{&m_graph, m_vx[idx2], m_vx[idx1], 1}; - } - } else if (idx2 > idx1) { - if (!m_vx[idx1]->hasRelativeMTask(m_vx[idx2])) { - new MTaskEdge{&m_graph, m_vx[idx1], m_vx[idx2], 1}; - } - } - } - - partInitCriticalPaths(&m_graph); - - // This SelfTest class is also the T_CostAccessor - PartPropagateCp prop(true); - - // Seed the propagator with every input node; - // This should result in the complete graph getting all CP's assigned. - for (const auto& i : m_vx) { - if (!i->inBeginp()) prop.cpHasIncreased(i, 1 /* inclusive CP starts at 1 */); - } - - // Run the propagator. - prop.go(); - - // Finally, confirm that the entire graph appears to have correct CPs. - partCheckCriticalPaths(&m_graph); - } - -public: - static void selfTest() { PartPropagateCpSelfTest{}.go(); } -}; - -// Merge edges from a LogicMtask. -// -// This code removes adjacent edges. When this occurs, mark it in need -// of a rescore, in case its score has fallen and we need to move it up -// toward the front of the scoreboard. -// -// Wait, what? Shouldn't the scores only increase as we merge nodes? Well -// that's almost true. But there is one exception. -// -// Suppose we have A->B, B->C, and A->C. -// -// The A->C edge is a "transitive" edge. It's ineligible to be merged, as -// the merge would create a cycle. We score it on the scoreboard like any -// other edge. -// -// However, our "score" estimate for A->C is bogus, because the forward -// critical path to C and the reverse critical path to A both contain the -// same node (B) so we overestimate the score of A->C. At first this -// doesn't matter, since transitive edges aren't eligible to merge anyway. -// -// Later, suppose the edge contractor decides to merge the B->C edge, with -// B donating all its incoming edges into C, say. (So we reach this -// function.) -// -// With B going away, the A->C edge will no longer be transitive and it -// will become eligible to merge. But if we don't mark it for rescore, -// it'll stay in the scoreboard with its old (overestimate) score. We'll -// merge it too late due to the bogus score. When we finally merge it, we -// fail the assert in the main edge contraction loop which checks that the -// actual score did not fall below the scoreboard's score. -// -// Another way of stating this: this code ensures that scores of -// non-transitive edges only ever increase. 
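The A->B, B->C, A->C scenario is easy to reproduce in miniature. A standalone sketch (simple string-keyed adjacency sets, not the partitioner's data structures) of how the A->C edge stops being transitive once B is merged away:

// Standalone illustration (not Verilator code): the edge A->C is
// transitive (merging it would create a cycle) only while another path
// A->B->C exists; once B merges into C, A->C becomes mergeable.
#include <cassert>
#include <map>
#include <set>
#include <string>

using Graph = std::map<std::string, std::set<std::string>>;

static bool reachable(const Graph& g, const std::string& from, const std::string& to) {
    if (from == to) return true;
    for (const std::string& next : g.at(from)) {
        if (reachable(g, next, to)) return true;
    }
    return false;
}

// The edge from->to is transitive if 'to' is still reachable after
// removing the direct edge. (Graph passed by value so we can mutate it.)
static bool isTransitive(Graph g, const std::string& from, const std::string& to) {
    g[from].erase(to);
    return reachable(g, from, to);
}

int main() {
    const Graph g{{"A", {"B", "C"}}, {"B", {"C"}}, {"C", {}}};
    assert(isTransitive(g, "A", "C"));  // A->B->C exists; merging A->C would cycle
    // Merge B into C: B disappears and its edges land on C. The path
    // through B is gone, so A->C is now an ordinary, mergeable edge
    // whose old (overestimated) score must be refreshed.
    const Graph merged{{"A", {"C"}}, {"C", {}}};
    assert(!isTransitive(merged, "A", "C"));
    return 0;
}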
-static void partRedirectEdgesFrom(V3Graph* graphp, LogicMTask* recipientp, LogicMTask* donorp, - MergeCandidateScoreboard* sbp) { - - // Process outgoing edges - MTaskEdge* outNextp = static_cast(donorp->outBeginp()); - while (outNextp) { - MTaskEdge* const edgep = outNextp; - LogicMTask* const relativep = outNextp->toMTaskp(); - outNextp = static_cast(outNextp->outNextp()); - - relativep->removeRelativeEdge(edgep); - - if (recipientp->hasRelativeMTask(relativep)) { - // An edge already exists between recipient and relative of donor. - // Mark it in need of a rescore - if (sbp) { - if (sbp->contains(edgep)) sbp->remove(edgep); - MTaskEdge* const existMTaskEdgep = static_cast( - recipientp->findConnectingEdgep(GraphWay::FORWARD, relativep)); -#if VL_DEBUG - UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge"); -#endif - if (sbp->contains(existMTaskEdgep)) sbp->hintScoreChanged(existMTaskEdgep); - } - VL_DO_DANGLING(edgep->unlinkDelete(), edgep); - } else { - // No existing edge between recipient and relative of donor. - // Redirect the edge from donor<->relative to recipient<->relative. - edgep->relinkFromp(recipientp); - recipientp->addRelativeMTask(relativep); - recipientp->stealRelativeEdge(edgep); - relativep->addRelativeEdge(edgep); - if (sbp) { - if (!sbp->contains(edgep)) { - sbp->add(edgep); - } else { - sbp->hintScoreChanged(edgep); - } - } - } - } - - // Process incoming edges - MTaskEdge* inNextp = static_cast(donorp->inBeginp()); - while (inNextp) { - MTaskEdge* const edgep = inNextp; - LogicMTask* const relativep = inNextp->fromMTaskp(); - inNextp = static_cast(inNextp->inNextp()); - - relativep->removeRelativeMTask(donorp); - relativep->removeRelativeEdge(edgep); - - if (relativep->hasRelativeMTask(recipientp)) { - // An edge already exists between recipient and relative of donor. - // Mark it in need of a rescore - if (sbp) { - if (sbp->contains(edgep)) sbp->remove(edgep); - MTaskEdge* const existMTaskEdgep = static_cast( - recipientp->findConnectingEdgep(GraphWay::REVERSE, relativep)); -#if VL_DEBUG - UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge"); -#endif - if (sbp->contains(existMTaskEdgep)) sbp->hintScoreChanged(existMTaskEdgep); - } - VL_DO_DANGLING(edgep->unlinkDelete(), edgep); - } else { - // No existing edge between recipient and relative of donor. - // Redirect the edge from donor<->relative to recipient<->relative. 
- edgep->relinkTop(recipientp); - relativep->addRelativeMTask(recipientp); - relativep->addRelativeEdge(edgep); - recipientp->stealRelativeEdge(edgep); - if (sbp) { - if (!sbp->contains(edgep)) { - sbp->add(edgep); - } else { - sbp->hintScoreChanged(edgep); - } - } - } - } - - // Remove donorp from the graph - VL_DO_DANGLING(donorp->unlinkDelete(graphp), donorp); -} - -//###################################################################### -// PartContraction - -// Perform edge or sibling contraction on the partition graph -class PartContraction final { - // TYPES - // New CP information for mtaskp reflecting an upcoming merge - struct NewCp final { - uint32_t cp; - uint32_t propagateCp; - bool propagate; - }; - - // MEMBERS - V3Graph* const m_mtasksp; // Mtask graph - uint32_t m_scoreLimit; // Sloppy score allowed when picking merges - uint32_t m_scoreLimitBeforeRescore = 0xffffffff; // Next score rescore at - unsigned m_mergesSinceRescore = 0; // Merges since last rescore - const bool m_slowAsserts; // Take extra time to validate algorithm - MergeCandidateScoreboard m_sb; // Scoreboard - - PartPropagateCp m_forwardPropagator{m_slowAsserts}; // Forward propagator - PartPropagateCp m_reversePropagator{m_slowAsserts}; // Reverse propagator - - LogicMTask* const m_entryMTaskp; // Singular source vertex of the dependency graph - LogicMTask* const m_exitMTaskp; // Singular sink vertex of the dependency graph - -public: - // CONSTRUCTORS - PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, LogicMTask* entryMTaskp, - LogicMTask* exitMTaskp, bool slowAsserts) - : m_mtasksp{mtasksp} - , m_scoreLimit{scoreLimit} - , m_slowAsserts{slowAsserts} - , m_entryMTaskp{entryMTaskp} - , m_exitMTaskp{exitMTaskp} {} - - // METHODS - void go() { - if (m_slowAsserts) { - // Check there are no redundant edges - for (V3GraphVertex* itp = m_mtasksp->verticesBeginp(); itp; - itp = itp->verticesNextp()) { - std::unordered_set neighbors; - for (V3GraphEdge* edgep = itp->outBeginp(); edgep; edgep = edgep->outNextp()) { - const bool first = neighbors.insert(edgep->top()).second; - UASSERT_OBJ(first, itp, "Redundant edge found in input to PartContraction()"); - } - } - } - - unsigned maxMTasks = v3Global.opt.threadsMaxMTasks(); - if (maxMTasks == 0) { // Unspecified so estimate - if (v3Global.opt.threads() > 1) { - maxMTasks = (PART_DEFAULT_MAX_MTASKS_PER_THREAD * v3Global.opt.threads()); - } else { - // Running PartContraction with --threads <= 1 means self-test - maxMTasks = 500; - } - } - - // OPTIMIZATION PASS: Edge contraction and sibling contraction. - // - Score each pair of mtasks which is a candidate to merge. - // * Each edge defines such a candidate pair - // * Two mtasks that are prereqs or postreqs of a common third - // vertex are "siblings", these are also a candidate pair. - // - Build a list of MergeCandidates, sorted by score. - // - Merge the best pair. - // - Incrementally recompute critical paths near the merged mtask. - - for (V3GraphVertex* itp = m_mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) { - itp->userp(nullptr); // Reset user value while we are here. Used by PartPropagateCp. 
-            for (V3GraphEdge* edgep = itp->outBeginp(); edgep; edgep = edgep->outNextp()) {
-                m_sb.add(static_cast<MTaskEdge*>(edgep));
-            }
-            siblingPairFromRelatives<GraphWay::REVERSE, true>(itp);
-            siblingPairFromRelatives<GraphWay::FORWARD, true>(itp);
-        }
-
-        doRescore();  // Set initial scores in scoreboard
-
-        while (true) {
-            // This is the best edge to merge, with the lowest
-            // score (shortest local critical path)
-            MergeCandidate* const mergeCanp = m_sb.best();
-            if (!mergeCanp) {
-                // Scoreboard found no eligible merges. Maybe a rescore
-                // will produce some merge-able pairs?
-                if (m_sb.needsRescore()) {
-                    doRescore();
-                    continue;
-                }
-                break;
-            }
-
-            if (m_slowAsserts) {
-                UASSERT(!m_sb.needsRescore(mergeCanp),
-                        "Need-rescore items should not be returned by best()");
-            }
-            const uint32_t cachedScore = mergeCanp->score();
-            mergeCanp->rescore();
-            const uint32_t actualScore = mergeCanp->score();
-
-            if (actualScore > cachedScore) {
-                // Cached score is out-of-date.
-                // Mark this elem as in need of a rescore and continue.
-                m_sb.hintScoreChanged(mergeCanp);
-                continue;
-            }
-            // ... we'll also confirm that actualScore hasn't shrunk relative
-            // to cachedScore, after the mergeWouldCreateCycle() check.
-
-            if (actualScore > m_scoreLimit) {
-                // Our best option isn't good enough
-                if (m_sb.needsRescore()) {
-                    // Some pairs need a rescore, maybe those will be
-                    // eligible to merge afterward.
-                    doRescore();
-                    continue;
-                } else {
-                    // We've exhausted everything below m_scoreLimit; stop.
-
-                    // Except, if we have too many mtasks, raise the score
-                    // limit and keep going...
-                    unsigned mtaskCount = 0;
-                    for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp(); vxp;
-                         vxp = vxp->verticesNextp()) {
-                        ++mtaskCount;
-                    }
-                    if (mtaskCount > maxMTasks) {
-                        const uint32_t oldLimit = m_scoreLimit;
-                        m_scoreLimit = (m_scoreLimit * 120) / 100;
-                        v3Global.rootp()->fileline()->v3warn(
-                            UNOPTTHREADS, "Thread scheduler is unable to provide requested "
-                                          "parallelism; suggest asking for fewer threads.");
-                        UINFO(1, "Critical path limit was=" << oldLimit << " now=" << m_scoreLimit
-                                                            << endl);
-                        continue;
-                    }
-                    // Really stop
-                    break;
-                }
-            }
-            if (actualScore > m_scoreLimitBeforeRescore) {
-                // Time to rescore; that will result in a higher
-                // scoreLimitBeforeRescore, and possibly lower-scoring
-                // elements returned from best().
-                doRescore();
-                continue;
-            }
-
-            // Avoid merging the entry/exit nodes. This would create serialization, by forcing the
-            // merged MTask to run before/after everything else. Empirically this helps
-            // performance in a modest way by allowing other MTasks to start earlier.
-            if (MTaskEdge* const edgep = mergeCanp->toMTaskEdge()) {
-                if (edgep->fromp() == m_entryMTaskp || edgep->top() == m_exitMTaskp) {
-                    m_sb.remove(mergeCanp);
-                    continue;
-                }
-            }
-
-            // Avoid merging any edge that would create a cycle.
-            //
-            // For example suppose we begin with vertices A, B, C and edges
-            // A->B, B->C, A->C.
-            //
-            // Suppose we want to merge A->C into a single vertex.
-            // New edges would be AC->B and B->AC, which is not a DAG.
-            // Do not allow this.
-            if (mergeCanp->mergeWouldCreateCycle()) {
-                // Remove this candidate from scoreboard so we don't keep
-                // reconsidering it on every loop.
-                m_sb.remove(mergeCanp);
-                if (SiblingMC* const smcp = mergeCanp->toSiblingMC()) {
-                    smcp->unlinkA();
-                    smcp->unlinkB();
-                    delete smcp;
-                }
-                continue;
-            }
-
-            partCheckCachedScoreVsActual(cachedScore, actualScore);
-
-            // Finally there's no cycle risk, no need to rescore, we're
-            // within m_scoreLimit and m_scoreLimitBeforeRescore.
-            // This is the edge to merge.
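// NOTE: illustrative sketch only, not from the Verilator sources. With
// PART_STEPPED_RESCORELIMIT enabled, the bookkeeping just below gives the merge
// loop 5% of score headroom before another full rescore is forced; the example
// value here is hypothetical:
constexpr uint32_t exFirstMergeScore = 1000;
static_assert((exFirstMergeScore * 105) / 100 == 1050,
              "the next rescore triggers once candidate scores exceed 1050");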
-            //
-            // Bookkeeping: if this is the first edge we'll merge since
-            // the last rescore, compute the new m_scoreLimitBeforeRescore
-            // to be somewhat higher than this edge's score.
-            if (m_mergesSinceRescore == 0) {
-#if PART_STEPPED_RESCORELIMIT
-                m_scoreLimitBeforeRescore = (actualScore * 105) / 100;
-#else
-                m_scoreLimitBeforeRescore = actualScore;
-#endif
-
-                // This print can serve as a progress indicator, as it
-                // increases from low numbers up toward cpLimit. It may be
-                // helpful to see progress during slow partitions. Maybe
-                // display something by default even?
-                UINFO(6, "New scoreLimitBeforeRescore: " << m_scoreLimitBeforeRescore << endl);
-            }
-
-            // Finally merge this candidate.
-            contract(mergeCanp);
-        }
-    }
-
-private:
-    template <GraphWay::en T_Way>
-    NewCp newCp(LogicMTask* mtaskp, LogicMTask* otherp, MTaskEdge* mergeEdgep) {
-        constexpr GraphWay way{T_Way};
-        // Return new wayward-CP for mtaskp reflecting its upcoming merge
-        // with otherp. Set 'result.propagate' if mtaskp's wayward
-        // relatives will see a new wayward CP from this merge.
-        uint32_t newCp;
-        if (mergeEdgep) {
-            if (mtaskp == mergeEdgep->furtherp(way)) {
-                newCp = std::max(otherp->critPathCost(way),
-                                 mtaskp->critPathCostWithout(way, mergeEdgep));
-            } else {
-                newCp = std::max(mtaskp->critPathCost(way),
-                                 otherp->critPathCostWithout(way, mergeEdgep));
-            }
-        } else {
-            newCp = std::max(otherp->critPathCost(way), mtaskp->critPathCost(way));
-        }
-
-        const uint32_t origRelativesCp = mtaskp->critPathCost(way) + mtaskp->stepCost();
-        const uint32_t newRelativesCp
-            = newCp + LogicMTask::stepCost(mtaskp->cost() + otherp->cost());
-
-        NewCp result;
-        result.cp = newCp;
-        result.propagate = (newRelativesCp > origRelativesCp);
-        result.propagateCp = newRelativesCp;
-        return result;
-    }
-
-    void removeSiblingMCsWith(LogicMTask* mtaskp) {
-        for (SiblingMC *smcp = mtaskp->aSiblingMCs().begin(), *nextp;  // lintok-begin-on-ref
-             smcp; smcp = nextp) {
-            nextp = smcp->aNextp();
-            m_sb.remove(smcp);
-            smcp->unlinkB();
-            delete smcp;
-        }
-        for (SiblingMC *smcp = mtaskp->bSiblingMCs().begin(), *nextp;  // lintok-begin-on-ref
-             smcp; smcp = nextp) {
-            nextp = smcp->bNextp();
-            m_sb.remove(smcp);
-            smcp->unlinkA();
-            delete smcp;
-        }
-    }
-
-    void removeSiblingMCs(LogicMTask* recipientp, LogicMTask* donorp) {
-        // The lists here should be disjoint (there should be only one SiblingMC involving these
-        // two MTasks, and we removed that elsewhere), so no need for unlinking from the lists we
-        // are clearing.
-        removeSiblingMCsWith(recipientp);
-        removeSiblingMCsWith(donorp);
-
-        // Clear the sibling map of the recipient. The donor will be deleted anyway, so we can
-        // leave it in a corrupt state for efficiency.
-        recipientp->siblings().clear();
-        recipientp->aSiblingMCs().reset();
-        recipientp->bSiblingMCs().reset();
-    }
-
-    void contract(MergeCandidate* mergeCanp) {
-        LogicMTask* top = nullptr;
-        LogicMTask* fromp = nullptr;
-        MTaskEdge* const mergeEdgep = mergeCanp->toMTaskEdge();
-        SiblingMC* const mergeSibsp = mergeCanp->toSiblingMC();
-        if (mergeEdgep) {
-            top = static_cast<LogicMTask*>(mergeEdgep->top());
-            fromp = static_cast<LogicMTask*>(mergeEdgep->fromp());
-        } else {
-            top = mergeSibsp->ap();
-            fromp = mergeSibsp->bp();
-        }
-
-        // Merge the smaller mtask into the larger mtask. If one of them
-        // is much larger, this will save time in partRedirectEdgesFrom().
-        // Assume the more costly mtask has more edges.
-        //
-        // [TODO: now that we have edge maps, we could count the edges
-        //  exactly without a linear search.]
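// NOTE: illustrative sketch only, not from the Verilator sources. Addressing the
// TODO above: if the relative-edge maps exposed their size (a hypothetical
// relativeMTasksCount() accessor, which does not exist in this codebase), the
// donor/recipient choice below could compare exact edge counts instead of using
// cost as a proxy, roughly:
//     const bool fromHasMoreEdges
//         = fromp->relativeMTasksCount() > top->relativeMTasksCount();
//     LogicMTask* const recipientp = fromHasMoreEdges ? fromp : top;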
-        LogicMTask* recipientp;
-        LogicMTask* donorp;
-        if (fromp->cost() > top->cost()) {
-            recipientp = fromp;
-            donorp = top;
-        } else {
-            donorp = fromp;
-            recipientp = top;
-        }
-        VL_DANGLING(fromp);
-        VL_DANGLING(top);  // Use donorp and recipientp now instead
-
-        // Recursively update forward and reverse CP numbers.
-        //
-        // Doing this before merging the mtasks lets us often avoid
-        // recursing through either incoming or outgoing edges on one or
-        // both mtasks.
-        //
-        // These 'NewCp' objects carry a bit indicating whether we must
-        // propagate CP for each of the four cases:
-        const NewCp recipientNewCpFwd = newCp<GraphWay::FORWARD>(recipientp, donorp, mergeEdgep);
-        const NewCp donorNewCpFwd = newCp<GraphWay::FORWARD>(donorp, recipientp, mergeEdgep);
-        const NewCp recipientNewCpRev = newCp<GraphWay::REVERSE>(recipientp, donorp, mergeEdgep);
-        const NewCp donorNewCpRev = newCp<GraphWay::REVERSE>(donorp, recipientp, mergeEdgep);
-
-        m_sb.remove(mergeCanp);
-
-        if (mergeEdgep) {
-            // Remove and free the connecting edge. Must do this before propagating CP's below.
-            mergeEdgep->fromMTaskp()->removeRelativeMTask(mergeEdgep->toMTaskp());
-            mergeEdgep->fromMTaskp()->removeRelativeEdge(mergeEdgep);
-            mergeEdgep->toMTaskp()->removeRelativeEdge(mergeEdgep);
-            VL_DO_DANGLING(mergeEdgep->unlinkDelete(), mergeEdgep);
-        } else {
-            // Remove the siblingMC
-            mergeSibsp->unlinkA();
-            mergeSibsp->unlinkB();
-            VL_DO_DANGLING(delete mergeSibsp, mergeSibsp);
-        }
-
-        // This also updates cost and stepCost on recipientp
-        recipientp->moveAllVerticesFrom(donorp);
-
-        UINFO(9, "recipient = " << recipientp->id() << ", donor = " << donorp->id()
-                                << ", mergeEdgep = " << mergeEdgep << "\n"
-                                << "recipientNewCpFwd = " << recipientNewCpFwd.cp
-                                << (recipientNewCpFwd.propagate ? " true " : " false ")
-                                << recipientNewCpFwd.propagateCp << "\n"
-                                << "donorNewCpFwd = " << donorNewCpFwd.cp
-                                << (donorNewCpFwd.propagate ? " true " : " false ")
-                                << donorNewCpFwd.propagateCp << endl);
-
-        recipientp->setCritPathCost(GraphWay::FORWARD, recipientNewCpFwd.cp);
-        if (recipientNewCpFwd.propagate) {
-            m_forwardPropagator.cpHasIncreased(recipientp, recipientNewCpFwd.propagateCp);
-        }
-        recipientp->setCritPathCost(GraphWay::REVERSE, recipientNewCpRev.cp);
-        if (recipientNewCpRev.propagate) {
-            m_reversePropagator.cpHasIncreased(recipientp, recipientNewCpRev.propagateCp);
-        }
-        if (donorNewCpFwd.propagate) {
-            m_forwardPropagator.cpHasIncreased(donorp, donorNewCpFwd.propagateCp);
-        }
-        if (donorNewCpRev.propagate) {
-            m_reversePropagator.cpHasIncreased(donorp, donorNewCpRev.propagateCp);
-        }
-        m_forwardPropagator.go();
-        m_reversePropagator.go();
-
-        // Remove all other SiblingMCs that include recipientp or donorp. We remove all siblingMCs
-        // of recipientp so we do not get huge numbers of SiblingMCs. We'll recreate them below, up
-        // to a bounded number.
-        removeSiblingMCs(recipientp, donorp);
-
-        // Redirect all edges, delete donorp
-        partRedirectEdgesFrom(m_mtasksp, recipientp, donorp, &m_sb);
-
-        ++m_mergesSinceRescore;
-
-        // Do an expensive check, confirm we haven't botched the CP
-        // updates.
-        if (m_slowAsserts) partCheckCriticalPaths(m_mtasksp);
-
-        // Finally, make new sibling pairs as needed:
-        //  - prereqs and postreqs of recipientp
-        //  - prereqs of recipientp's postreqs
-        //  - postreqs of recipientp's prereqs
-        // Note that this depends on the updated critical paths (above).
-        siblingPairFromRelatives<GraphWay::REVERSE, true>(recipientp);
-        siblingPairFromRelatives<GraphWay::FORWARD, true>(recipientp);
-        unsigned edges = 0;
-        for (V3GraphEdge* edgep = recipientp->outBeginp(); edgep; edgep = edgep->outNextp()) {
-            LogicMTask* const postreqp = static_cast<LogicMTask*>(edgep->top());
-            siblingPairFromRelatives<GraphWay::REVERSE, false>(postreqp);
-            ++edges;
-            if (edges >= PART_SIBLING_EDGE_LIMIT) break;
-        }
-        edges = 0;
-        for (V3GraphEdge* edgep = recipientp->inBeginp(); edgep; edgep = edgep->inNextp()) {
-            LogicMTask* const prereqp = static_cast<LogicMTask*>(edgep->fromp());
-            siblingPairFromRelatives<GraphWay::FORWARD, false>(prereqp);
-            ++edges;
-            if (edges >= PART_SIBLING_EDGE_LIMIT) break;
-        }
-    }
-
-    void doRescore() {
-        // During rescore, we know that the graph isn't changing, so allow
-        // the critPathCost*Without() routines to cache some data in
-        // each LogicMTask. This is just an optimization; things should
-        // behave identically without the caching (just slower)
-
-        m_sb.rescore();
-        UINFO(6, "Did rescore. Merges since previous = " << m_mergesSinceRescore << endl);
-
-        m_mergesSinceRescore = 0;
-        m_scoreLimitBeforeRescore = 0xffffffff;
-    }
-
-    void makeSiblingMC(LogicMTask* ap, LogicMTask* bp) {
-        if (ap->id() < bp->id()) std::swap(ap, bp);
-        // The higher id vertex owns the association set
-        const auto first = ap->siblings().insert(bp).second;
-        if (first) {
-            m_sb.add(new SiblingMC{ap, bp});
-        } else if (VL_UNLIKELY(m_slowAsserts)) {
-            // It's fine if we already have this SiblingMC, we may have
-            // created it earlier. Just confirm that we have associated data.
-            bool found = false;
-            for (const SiblingMC* smcp = ap->aSiblingMCs().begin();  // lintok-begin-on-ref
-                 smcp; smcp = smcp->aNextp()) {
-                UASSERT_OBJ(smcp->ap() == ap, ap, "Inconsistent SiblingMC");
-                UASSERT_OBJ(m_sb.contains(smcp), ap, "Must be on the scoreboard");
-                if (smcp->bp() == bp) found = true;
-            }
-            UASSERT_OBJ(found, ap, "Sibling not found");
-        }
-    }
-
-    template <GraphWay::en T_Way, bool Exhaustive>
-    void siblingPairFromRelatives(V3GraphVertex* mtaskp) {
-        constexpr GraphWay way{T_Way};
-        // Need at least 2 edges
-        if (!mtaskp->beginp(way) || !mtaskp->beginp(way)->nextp(way)) return;
-
-        std::array<LogicMTask*, PART_SIBLING_EDGE_LIMIT> neighbors;
-
-        // This is a hot method, so we want to sort as efficiently as possible. We pre-load
-        // all data (critical path cost and id) required for determining ordering into an aligned
-        // structure. There is not enough space next to these to keep a whole pointer within 16
-        // bytes, so we store an index into the 'neighbors' buffer instead. We can then compare
-        // and swap these sorting records very efficiently. With this the standard library sorting
-        // functions are efficient enough and using more optimized methods (e.g.: sorting networks)
-        // has no measurable benefit.
-        struct alignas(16) SortingRecord final {
-            uint64_t m_id;
-            uint32_t m_cp;
-            uint8_t m_idx;
-            static_assert(PART_SIBLING_EDGE_LIMIT <= std::numeric_limits<uint8_t>::max(),
-                          "m_idx must fit all indices into 'neighbors'");
-            bool operator<(const SortingRecord& that) const {
-                return m_cp < that.m_cp || (m_cp == that.m_cp && m_id < that.m_id);
-            }
-        };
-        static_assert(sizeof(SortingRecord) <= 16, "How could this be padded to more than 16?");
-
-        std::array<SortingRecord, PART_SIBLING_EDGE_LIMIT> sortRecs;
-        size_t n = 0;
-
-        // Populate the buffers
-        for (V3GraphEdge *edgep = mtaskp->beginp(way), *nextp; edgep; edgep = nextp) {
-            nextp = edgep->nextp(way);  // Fetch next first as likely cache miss
-            LogicMTask* const otherp = static_cast<LogicMTask*>(edgep->furtherp(way));
-            neighbors[n] = otherp;
-            sortRecs[n].m_id = otherp->id();
-            sortRecs[n].m_cp = otherp->critPathCost(way) + otherp->cost();
-            sortRecs[n].m_idx = n;
-            ++n;
-            // Prevent nodes with huge numbers of edges from massively slowing us down
-            if (n >= PART_SIBLING_EDGE_LIMIT) break;
-        }
-
-        // Don't make all possible pairs of siblings when not requested (non-exhaustive).
-        // Just make a few pairs.
-        constexpr size_t MAX_NONEXHAUSTIVE_PAIRS = 3;
-
-        if (Exhaustive || n <= 2 * MAX_NONEXHAUSTIVE_PAIRS) {
-            const size_t end = n & ~static_cast<size_t>(1);  // Round down to even (we want pairs)
-            std::sort(sortRecs.begin(), sortRecs.begin() + n);
-            for (size_t i = 0; i < end; i += 2) {
-                makeSiblingMC(neighbors[sortRecs[i].m_idx], neighbors[sortRecs[i + 1].m_idx]);
-            }
-        } else {
-            constexpr size_t end = 2 * MAX_NONEXHAUSTIVE_PAIRS;
-            std::partial_sort(sortRecs.begin(), sortRecs.begin() + end, sortRecs.begin() + n);
-            for (size_t i = 0; i < end; i += 2) {
-                makeSiblingMC(neighbors[sortRecs[i].m_idx], neighbors[sortRecs[i + 1].m_idx]);
-            }
-        }
-    }
-
-    // SELF TESTS
-
-    // This is a performance test, its intent is to demonstrate that the
-    // partitioner doesn't run on this chain in N^2 time or worse. Overall
-    // runtime should be N*log(N) for a chain-shaped graph.
-    //
-    static void selfTestChain() {
-        const uint64_t usecsSmall = partitionChainUsecs(5);
-        const uint64_t usecsLarge = partitionChainUsecs(500);
-        // Large input is 100x bigger than small input.
-        // Its runtime should grow roughly in proportion (N*log(N)) -- not by
-        // about 10000x or worse, which would suggest N^2 scaling or worse.
-        UASSERT(usecsLarge < (usecsSmall * 1500),
-                "selfTestChain() took longer than expected. Small input runtime = "
-                    << usecsSmall << ", large input runtime = " << usecsLarge);
-    }
-
-    static uint64_t partitionChainUsecs(unsigned chain_len) {
-        // NOTE: To get a dot file run with --debugi-V3Partition 4 or more.
-        const uint64_t startUsecs = V3Os::timeUsecs();
-        V3Graph mtasks;
-        LogicMTask* lastp = nullptr;
-        for (unsigned i = 0; i < chain_len; ++i) {
-            LogicMTask* const mtp = new LogicMTask{&mtasks, nullptr};
-            mtp->setCost(1);
-            if (lastp) new MTaskEdge{&mtasks, lastp, mtp, 1};
-            lastp = mtp;
-        }
-        partInitCriticalPaths(&mtasks);
-
-        // Since slowAsserts mode is *expected* to cause N^2 runtime, and the
-        // intent of this test is to demonstrate better-than-N^2 runtime, disable
-        // slowAsserts.
-        PartContraction ec{&mtasks,
-                           // Any CP limit >chain_len should work:
-                           chain_len * 2, nullptr, nullptr, false /* slowAsserts */};
-        ec.go();
-
-        // All vertices should merge into one
-        UASSERT_SELFTEST(
-            bool, mtasks.verticesBeginp() && !mtasks.verticesBeginp()->verticesNextp(), true);
-
-        const uint64_t endUsecs = V3Os::timeUsecs();
-        const uint64_t elapsedUsecs = endUsecs - startUsecs;
-
-        return elapsedUsecs;
-    }
-
-    // This test defends against a particular failure mode that the
-    // partitioner exhibited during development:
-    //
-    // At one time, the partitioner consistently favored edge-merges over
-    // equal-scoring sibling merges. Every edge and sibling merge in this
-    // test starts out with an equal score. If you only do edge-merges, all
-    // possible merges will continue to have equal score as the center node
-    // grows and grows. Soon the critical path budget is exhausted by a
-    // large center node, and we still have many small leaf nodes -- it's
-    // literally the worst partition possible.
-    //
-    // Now, instead, the partitioner gives slight favoritism to sibling
-    // merges in the event that scores are tied. This is better for the
-    // test and also for real designs.
-    static void selfTestX() {
-        // NOTE: To get a dot file run with --debugi-V3Partition 4 or more.
-        V3Graph mtasks;
-        LogicMTask* const centerp = new LogicMTask{&mtasks, nullptr};
-        centerp->setCost(1);
-        unsigned i;
-        for (i = 0; i < 50; ++i) {
-            LogicMTask* const mtp = new LogicMTask{&mtasks, nullptr};
-            mtp->setCost(1);
-            // Edge from every input -> centerp
-            new MTaskEdge{&mtasks, mtp, centerp, 1};
-        }
-        for (i = 0; i < 50; ++i) {
-            LogicMTask* const mtp = new LogicMTask{&mtasks, nullptr};
-            mtp->setCost(1);
-            // Edge from centerp -> every output
-            new MTaskEdge{&mtasks, centerp, mtp, 1};
-        }
-
-        partInitCriticalPaths(&mtasks);
-        PartContraction{&mtasks, 20, nullptr, nullptr, true}.go();
-
-        const auto report = mtasks.parallelismReport(
-            [](const V3GraphVertex* vtxp) { return vtxp->as<LogicMTask>()->cost(); });
-
-        // Checking exact values here is maybe overly precise. What we're
-        // mostly looking for is a healthy reduction in the number of mtasks.
-        UASSERT_SELFTEST(uint32_t, report.criticalPathCost(), 19);
-        UASSERT_SELFTEST(uint32_t, report.totalGraphCost(), 101);
-        UASSERT_SELFTEST(uint32_t, report.vertexCount(), 14);
-        UASSERT_SELFTEST(uint32_t, report.edgeCount(), 13);
-    }
-
-public:
-    static void selfTest() {
-        selfTestX();
-        selfTestChain();
-    }
-
-private:
-    VL_UNCOPYABLE(PartContraction);
-};
-
-//######################################################################
-// DpiImportCallVisitor
-
-// Scan node, indicate whether it contains a call to a DPI-imported
-// routine.
-class DpiImportCallVisitor final : public VNVisitor {
-    bool m_hasDpiHazard = false;  // Found a DPI import call.
-    bool m_tracingCall = false;  // Iterating into a CCall to a CFunc
-    // METHODS
-    void visit(AstCFunc* nodep) override {
-        if (!m_tracingCall) return;
-        m_tracingCall = false;
-        if (nodep->dpiImportWrapper()) {
-            if (nodep->dpiPure() ? !v3Global.opt.threadsDpiPure()
-                                 : !v3Global.opt.threadsDpiUnpure()) {
-                m_hasDpiHazard = true;
-            }
-        }
-        iterateChildren(nodep);
-    }
-    void visit(AstNodeCCall* nodep) override {
-        iterateChildren(nodep);
-        // Enter the function and trace it
-        m_tracingCall = true;
-        iterate(nodep->funcp());
-    }
-    void visit(AstNode* nodep) override { iterateChildren(nodep); }
-
-public:
-    // CONSTRUCTORS
-    explicit DpiImportCallVisitor(AstNode* nodep) { iterate(nodep); }
-    bool hasDpiHazard() const { return m_hasDpiHazard; }
-    ~DpiImportCallVisitor() override = default;
-
-private:
-    VL_UNCOPYABLE(DpiImportCallVisitor);
-};
-
-//######################################################################
-// PartFixDataHazards
-
-// Fix data hazards in the partition graph.
-//
-// The fine-grained graph from V3Order may contain data hazards which are
-// not a problem for serial mode, but which would be a problem in parallel
-// mode.
-//
-// There are basically two classes: unordered pairs of writes, and
-// unordered write-read pairs. We fix both here, with a combination of
-// MTask-merges and new edges to ensure no such unordered pairs remain.
-//
-// ABOUT UNORDERED WRITE-WRITE PAIRS
-//
-// The V3Order dependency graph treats these as unordered events:
-//
-//   a)  sig[15:8] = stuff;
-//   ...
-//   b)  sig[7:0] = other_stuff;
-//
-// Seems OK, right? They are writes to disjoint bits of the same
-// signal. They can run in either order, in serial mode, and the result
-// will be the same.
-//
-// The resulting C code for each of these isn't a pure write, it's
-// actually an R-M-W sequence:
-//
-//   a)  sig = (sig & 0xff) | (0xff00 & (stuff << 8));
-//   ...
-//   b)  sig = (sig & 0xff00) | (0xff & other_stuff);
-//
-// In serial mode, order doesn't matter so long as these run serially.
-// In parallel mode, we must serialize these RMW's to avoid a race.
-//
-// We don't actually check here if each write would involve an R-M-W, we
-// just assume that it would. If this routine ever causes a drastic
-// increase in critical path, it could be optimized to make a better
-// prediction (with all the risk that word implies!) about whether a
-// given write is likely to turn into an R-M-W.
-//
-// ABOUT UNORDERED WRITE-READ PAIRS
-//
-// If we don't put unordered write-read pairs into some order at Verilation
-// time, we risk a runtime race.
-//
-// How do such unordered writer/reader pairs happen? Here's a partial list
-// of scenarios:
-//
-// Case 1: Circular logic
-//
-// If the design has circular logic, V3Order has by now generated some
-// dependency cycles, and also cut some of the edges to make it
-// acyclic.
-//
-// For serial mode, that was fine. We can break logic circles at an
-// arbitrary point. At runtime, we'll repeat the _eval() until no
-// changes are detected, which papers over the discarded dependency.
-//
-// For parallel mode, this situation can lead to unordered reads and
-// writes of the same variable, causing a data race. For example if the
-// original code is this:
-//
-//   assign b = b | a << 2;
-//   assign out = b;
-//
-// ... there's originally a dependency edge which records that 'b'
-// depends on the first assign. V3Order may cut this edge, making the
-// statements unordered. In serial mode that's fine, they can run in
-// either order. In parallel mode it's a reader/writer race.
-//
-// Case 2: Race Condition in Verilog Sources
-//
-// If the input has races, e.g. blocking assignments in always blocks
-// that share variables, the graph at this point will contain unordered
-// writes and reads (or unordered write-write pairs) reflecting that.
-//
-// Case 3: Interesting V3Order Behavior
-//
-// There's code in V3Order that explicitly avoids making a dependency
-// edge from a clock-gater signal to the logic node that produces the
-// clock signal. This leads to unordered reader/writer pairs in
-// parallel mode.
-//
-class PartFixDataHazards final {
-    // TYPES
-    using TasksByRank = std::map<uint32_t, std::set<LogicMTask*, CmpLogicMTask>>;
-
-    // MEMBERS
-    const OrderGraph* const m_orderGraphp;  // The OrderGraph
-    V3Graph* const m_mtasksp;  // Mtask graph
-public:
-    // CONSTRUCTORS
-    explicit PartFixDataHazards(const OrderGraph* orderGraphp, V3Graph* mtasksp)
-        : m_orderGraphp{orderGraphp}
-        , m_mtasksp{mtasksp} {}
-    // METHODS
-private:
-    void findAdjacentTasks(const OrderVarStdVertex* varVtxp, TasksByRank& tasksByRank) {
-        // Find all writer tasks for this variable, group by rank.
-        for (V3GraphEdge* edgep = varVtxp->inBeginp(); edgep; edgep = edgep->inNextp()) {
-            if (const auto* const logicVtxp = edgep->fromp()->cast<OrderLogicVertex>()) {
-                LogicMTask* const writerMtaskp = static_cast<LogicMTask*>(logicVtxp->userp());
-                tasksByRank[writerMtaskp->rank()].insert(writerMtaskp);
-            }
-        }
-        // Note: we deliberately do not find reader tasks for this variable.
-        // There was "broken" code here to find readers, but fixing it to
-        // work properly harmed performance on some tests, see issue #3360.
-    }
-    void mergeSameRankTasks(const TasksByRank& tasksByRank) {
-        LogicMTask* lastRecipientp = nullptr;
-        for (const auto& pair : tasksByRank) {
-            // Find the largest node at this rank, merge into it. (If we
-            // happen to find a huge node, this saves time in
-            // partRedirectEdgesFrom() versus merging into an arbitrary node.)
-            LogicMTask* recipientp = nullptr;
-            for (LogicMTask* const mtaskp : pair.second) {
-                if (!recipientp || (recipientp->cost() < mtaskp->cost())) recipientp = mtaskp;
-            }
-            UASSERT_OBJ(!lastRecipientp || (lastRecipientp->rank() < recipientp->rank()),
-                        recipientp, "Merging must be on lower rank");
-
-            for (LogicMTask* const donorp : pair.second) {
-                // Merge donor into recipient.
-                if (donorp == recipientp) continue;
-                // Fix up the map, so donor's OLVs map to recipientp
-                for (const MTaskMoveVertex* const tmvp : *(donorp->vertexListp())) {
-                    tmvp->logicp()->userp(recipientp);
-                }
-                // Move all vertices from donorp to recipientp
-                recipientp->moveAllVerticesFrom(donorp);
-                // Redirect edges from donorp to recipientp, delete donorp
-                partRedirectEdgesFrom(m_mtasksp, recipientp, donorp, nullptr);
-            }
-
-            if (lastRecipientp && !lastRecipientp->hasRelativeMTask(recipientp)) {
-                new MTaskEdge{m_mtasksp, lastRecipientp, recipientp, 1};
-            }
-            lastRecipientp = recipientp;
-        }
-    }
-    bool hasDpiHazard(LogicMTask* mtaskp) {
-        for (const MTaskMoveVertex* const moveVtxp : *(mtaskp->vertexListp())) {
-            if (OrderLogicVertex* const lvtxp = moveVtxp->logicp()) {
-                // NOTE: We don't handle DPI exports. If testbench code calls a
-                // DPI-exported function at any time during eval() we may have
-                // a data hazard. (Likewise in non-threaded mode if an export
-                // messes with an ordered variable we're broken.)
-
-                // Find all calls to DPI-imported functions, we can put those
-                // into a serial order at least. That should solve the most
-                // likely DPI-related data hazards.
-                if (DpiImportCallVisitor{lvtxp->nodep()}.hasDpiHazard()) return true;
-            }
-        }
-        return false;
-    }
-
-public:
-    void go() {
-        // Rank the graph. Streaming it (GraphStreamUnordered) is faster than V3GraphAlg's
-        // recursive rank, and also allows us to set up the OrderLogicVertex -> LogicMTask
-        // map at the same time.
-        {
-            GraphStreamUnordered serialize{m_mtasksp};
-            while (LogicMTask* const mtaskp
-                   = const_cast<LogicMTask*>(static_cast<const LogicMTask*>(serialize.nextp()))) {
-                // Compute and assign rank
-                uint32_t rank = 0;
-                for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep; edgep = edgep->inNextp()) {
-                    rank = std::max(edgep->fromp()->rank() + 1, rank);
-                }
-                mtaskp->rank(rank);
-
-                // Set up the OrderLogicVertex -> LogicMTask map
-                // Entry and exit MTasks have no MTaskMoveVertices under them, so move on
-                if (mtaskp->vertexListp()->empty()) continue;
-                // Otherwise there should be only one MTaskMoveVertex in each MTask at this stage
-                UASSERT_OBJ(mtaskp->vertexListp()->size() == 1, mtaskp,
-                            "Multiple MTaskMoveVertex");
-                const MTaskMoveVertex* const moveVtxp = mtaskp->vertexListp()->front();
-                // Set up mapping back to the MTask from the OrderLogicVertex
-                if (OrderLogicVertex* const lvtxp = moveVtxp->logicp()) lvtxp->userp(mtaskp);
-            }
-        }
-
-        // Gather all variables. SystemC vars will be handled slightly specially, so keep separate.
-        std::vector<const OrderVarStdVertex*> regularVars;
-        std::vector<const OrderVarStdVertex*> systemCVars;
-        for (V3GraphVertex *vtxp = m_orderGraphp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
-            nextp = vtxp->verticesNextp();
-            // Only consider OrderVarStdVertex which reflects
-            // an actual lvalue assignment; the others do not.
-            if (const OrderVarStdVertex* const vvtxp = vtxp->cast<OrderVarStdVertex>()) {
-                if (vvtxp->vscp()->varp()->isSc()) {
-                    systemCVars.push_back(vvtxp);
-                } else {
-                    regularVars.push_back(vvtxp);
-                }
-            }
-        }
-
-        // For each OrderVarVertex, look at its writer and reader mtasks.
-        //
-        // If there's a set of writers and readers at the same rank, we
-        // know these are unordered with respect to one another, so merge
-        // those mtasks all together.
-        //
-        // At this point, we have at most one merged mtask per rank (for a
-        // given OVV.) Create edges across these remaining mtasks to ensure
-        // they run in serial order (going along with the existing ranks.)
-        //
-        // NOTE: we don't update the CP's stored in the LogicMTasks to
-        // reflect the changes we make to the graph. That's OK, as we
-        // haven't yet initialized CPs when we call this routine.
-        for (const OrderVarStdVertex* const varVtxp : regularVars) {
-            // Build a set of mtasks, per rank, which access this var.
-            // Within a rank, sort by MTaskID to avoid nondeterminism.
-            TasksByRank tasksByRank;
-
-            // Find all reader and writer tasks for this variable, add to
-            // tasksByRank.
-            findAdjacentTasks(varVtxp, tasksByRank);
-
-            // Merge all writer and reader tasks from the same rank together.
-            //
-            // NOTE: Strictly speaking, we don't need to merge all the
-            // readers together. That may lead to extra serialization. The
-            // least amount of ordering we could impose here would be to
-            // merge all writers at a given rank together; then make edges
-            // from the merged writer node to each reader node at the same
-            // rank; and then from each reader node to the merged writer at
-            // the next rank.
-            //
-            // Whereas, merging all readers and writers at the same rank
-            // together is "the simplest thing that could possibly work"
-            // and it seems to. It also creates fairly few edges. We don't
-            // want to create tons of edges here; doing so is not nice to
-            // the main edge contraction pass.
-            mergeSameRankTasks(tasksByRank);
-        }
-
-        // Handle SystemC vars just a little differently. Instead of
-        // treating each var as an independent entity, and serializing
-        // writes to that one var, we treat ALL SystemC vars as a single
-        // entity and serialize writes (and, conservatively, reads) across
-        // all of them.
-        //
-        // Reasoning: writing a SystemC var actually turns into a call to a
-        // var.write() method, which under the hood is accessing some data
-        // structure that's shared by many SC vars. It's not thread safe.
-        //
-        // Hopefully we only have a few SC vars -- top level ports, probably.
-        {
-            TasksByRank tasksByRank;
-            for (const OrderVarStdVertex* const varVtxp : systemCVars) {
-                findAdjacentTasks(varVtxp, tasksByRank);
-            }
-            mergeSameRankTasks(tasksByRank);
-        }
-
-        // Handle nodes containing DPI calls; we want to serialize those
-        // by default unless user gave --threads-dpi-concurrent.
-        // Same basic strategy as above to serialize access to SC vars.
-        if (!v3Global.opt.threadsDpiPure() || !v3Global.opt.threadsDpiUnpure()) {
-            TasksByRank tasksByRank;
-            for (V3GraphVertex *vtxp = m_mtasksp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
-                nextp = vtxp->verticesNextp();
-                LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp);
-                if (hasDpiHazard(mtaskp)) tasksByRank[mtaskp->rank()].insert(mtaskp);
-            }
-            mergeSameRankTasks(tasksByRank);
-        }
-    }
-
-private:
-    VL_UNCOPYABLE(PartFixDataHazards);
-};
-
-//######################################################################
-// ThreadSchedule
-
-class PartPackMTasks;
-
-// The thread schedule, containing all information needed later. Note that this is a simple
-// aggregate data type and the only way to get hold of an instance of it is via
-// PartPackMTasks::pack, which is moved from there and is const, which means we can only acquire a
-// const reference to it, so no further modifications are allowed, so all members are public
-// (attributes).
-class ThreadSchedule final {
-public:
-    // CONSTANTS
-    static constexpr uint32_t UNASSIGNED = 0xffffffff;
-
-    // TYPES
-    struct MTaskState final {
-        uint32_t completionTime = 0;  // Estimated time this mtask will complete
-        uint32_t threadId = UNASSIGNED;  // Thread id this MTask is assigned to
-        const ExecMTask* nextp = nullptr;  // Next MTask on same thread after this
-    };
-
-    // MEMBERS
-    // Allocation of sequence of MTasks to threads. Can be considered a map from thread ID to
-    // the sequence of MTasks to be executed by that thread.
-    std::vector<std::vector<const ExecMTask*>> threads;
-
-    // State for each mtask.
-    std::unordered_map<const ExecMTask*, MTaskState> mtaskState;
-
-    uint32_t threadId(const ExecMTask* mtaskp) const {
-        const auto& it = mtaskState.find(mtaskp);
-        if (it != mtaskState.end()) {
-            return it->second.threadId;
-        } else {
-            return UNASSIGNED;
-        }
-    }
-
-private:
-    friend class PartPackMTasks;
-
-    explicit ThreadSchedule(uint32_t nThreads)
-        : threads{nThreads} {}
-    VL_UNCOPYABLE(ThreadSchedule);  // But movable
-    ThreadSchedule(ThreadSchedule&&) = default;
-    ThreadSchedule& operator=(ThreadSchedule&&) = default;
-
-    // Debugging
-    void dumpDotFile(const V3Graph& graph, const string& filename) const;
-    void dumpDotFilePrefixedAlways(const V3Graph& graph, const string& nameComment) const;
-
-public:
-    // Returns the number of cross-thread dependencies of the given MTask. If > 0, the MTask must
-    // test whether its dependencies are ready before starting, and therefore may need to block.
-    uint32_t crossThreadDependencies(const ExecMTask* mtaskp) const {
-        const uint32_t thisThreadId = threadId(mtaskp);
-        uint32_t result = 0;
-        for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep; edgep = edgep->inNextp()) {
-            const ExecMTask* const prevp = edgep->fromp()->as<ExecMTask>();
-            if (threadId(prevp) != thisThreadId) ++result;
-        }
-        return result;
-    }
-
-    uint32_t startTime(const ExecMTask* mtaskp) const {
-        return mtaskState.at(mtaskp).completionTime - mtaskp->cost();
-    }
-    uint32_t endTime(const ExecMTask* mtaskp) const {
-        return mtaskState.at(mtaskp).completionTime;
-    }
-};
-
-//! Variant of dumpDotFilePrefixed without --dump option check
-void ThreadSchedule::dumpDotFilePrefixedAlways(const V3Graph& graph,
-                                               const string& nameComment) const {
-    dumpDotFile(graph, v3Global.debugFilename(nameComment) + ".dot");
-}
-
-void ThreadSchedule::dumpDotFile(const V3Graph& graph, const string& filename) const {
-    // This generates a file used by graphviz, https://www.graphviz.org
-    const std::unique_ptr<std::ofstream> logp{V3File::new_ofstream(filename)};
-    if (logp->fail()) v3fatal("Can't write " << filename);
-
-    // Header
-    *logp << "digraph v3graph {\n";
-    *logp << "  graph[layout=\"neato\" labelloc=t labeljust=l label=\"" << filename << "\"]\n";
-    *logp << "  node[shape=\"rect\" ratio=\"fill\" fixedsize=true]\n";
-
-    // Thread labels
-    *logp << "\n  // Threads\n";
-    const int threadBoxWidth = 2;
-    for (int i = 0; i < v3Global.opt.threads(); i++) {
-        *logp << "  t" << i << " [label=\"Thread " << i << "\" width=" << threadBoxWidth
-              << " pos=\"" << (-threadBoxWidth / 2) << "," << -i
-              << "!\" style=\"filled\" fillcolor=\"grey\"] \n";
-    }
-
-    // MTask nodes
-    *logp << "\n  // MTasks\n";
-
-    // Find minimum cost MTask for scaling MTask node widths
-    uint32_t minCost = UINT32_MAX;
-    for (const V3GraphVertex* vxp = graph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
-        if (const ExecMTask* const mtaskp = vxp->cast<const ExecMTask>()) {
-            minCost = minCost > mtaskp->cost() ? mtaskp->cost() : minCost;
-        }
-    }
-    const double minWidth = 2.0;
-    const auto mtaskXPos = [&](const ExecMTask* mtaskp, const double nodeWidth) {
-        const double startPosX = (minWidth * startTime(mtaskp)) / minCost;
-        return nodeWidth / minWidth + startPosX;
-    };
-
-    const auto emitMTask = [&](const ExecMTask* mtaskp) {
-        const int thread = threadId(mtaskp);
-        const double nodeWidth = minWidth * (static_cast<double>(mtaskp->cost()) / minCost);
-        const double x = mtaskXPos(mtaskp, nodeWidth);
-        const int y = -thread;
-        const string label = "label=\"" + mtaskp->name() + " (" + cvtToStr(startTime(mtaskp)) + ":"
-                             + std::to_string(endTime(mtaskp)) + ")" + "\"";
-        *logp << "  " << mtaskp->name() << " [" << label << " width=" << nodeWidth << " pos=\""
-              << x << "," << y << "!\"]\n";
-    };
-
-    // Emit MTasks
-    for (const V3GraphVertex* vxp = graph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
-        if (const ExecMTask* const mtaskp = vxp->cast<const ExecMTask>()) emitMTask(mtaskp);
-    }
-
-    // Emit MTask dependency edges
-    *logp << "\n  // MTask dependencies\n";
-    for (const V3GraphVertex* vxp = graph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
-        if (const ExecMTask* const mtaskp = vxp->cast<const ExecMTask>()) {
-            for (V3GraphEdge* edgep = mtaskp->outBeginp(); edgep; edgep = edgep->outNextp()) {
-                const V3GraphVertex* const top = edgep->top();
-                *logp << "  " << vxp->name() << " -> " << top->name() << "\n";
-            }
-        }
-    }
-
-    // Trailer
-    *logp << "}\n";
-    logp->close();
-}
-
-//######################################################################
-// PartPackMTasks
-
-// Statically pack tasks into threads.
-//
-// The simplest thing that could possibly work would be to assume that our
-// predictions of task runtimes are precise, and that every thread will
-// make progress at an equal rate. Simulate a single "clock", pack the
-// highest priority ready task into whatever thread becomes ready earliest,
-// repeating until no tasks remain.
-//
-// That doesn't work well, as our predictions of task runtimes have wide
-// error bars (+/- 60% is typical.)
-//
-// So be a little more clever: let each task have a different end time,
-// depending on which thread is looking. Be a little bit pessimistic when
-// thread A checks the end time of an mtask running on thread B. This extra
-// "padding" avoids tight "layovers" at cross-thread dependencies.
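// NOTE: illustrative sketch only, not from the Verilator sources. A minimal model
// of the cross-thread padding described above, using the default 30/100 sandbag
// and ignoring the successor clamp that completionTime() additionally applies:
static inline uint32_t exampleSandbaggedEnd(uint32_t completionTime, uint32_t cost) {
    // e.g. completionTime=1000, cost=1000 -> another thread sees 1300
    return completionTime + (30 * cost) / 100;
}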
-class PartPackMTasks final {
-    // TYPES
-    struct MTaskCmp final {
-        bool operator()(const ExecMTask* ap, const ExecMTask* bp) const {
-            return ap->id() < bp->id();
-        }
-    };
-
-    // MEMBERS
-    const uint32_t m_nThreads;  // Number of threads
-    const uint32_t m_sandbagNumerator;  // Numerator padding for est runtime
-    const uint32_t m_sandbagDenom;  // Denominator padding for est runtime
-
-public:
-    // CONSTRUCTORS
-    explicit PartPackMTasks(uint32_t nThreads = v3Global.opt.threads(),
-                            unsigned sandbagNumerator = 30, unsigned sandbagDenom = 100)
-        : m_nThreads{nThreads}
-        , m_sandbagNumerator{sandbagNumerator}
-        , m_sandbagDenom{sandbagDenom} {}
-    ~PartPackMTasks() = default;
-
-private:
-    // METHODS
-    uint32_t completionTime(const ThreadSchedule& schedule, const ExecMTask* mtaskp,
-                            uint32_t threadId) {
-        const ThreadSchedule::MTaskState& state = schedule.mtaskState.at(mtaskp);
-        UASSERT(state.threadId != ThreadSchedule::UNASSIGNED, "Mtask should have assigned thread");
-        if (threadId == state.threadId) {
-            // No overhead on same thread
-            return state.completionTime;
-        }
-
-        // Add some padding to the estimated runtime when looking from
-        // another thread
-        uint32_t sandbaggedEndTime
-            = state.completionTime + (m_sandbagNumerator * mtaskp->cost()) / m_sandbagDenom;
-
-        // If task B is packed after task A on thread 0, don't let thread 1
-        // think that A finishes earlier than thread 0 thinks that B
-        // finishes, otherwise we get priority inversions and fail the self
-        // test.
-        if (state.nextp) {
-            const uint32_t successorEndTime
-                = completionTime(schedule, state.nextp, state.threadId);
-            if ((sandbaggedEndTime >= successorEndTime) && (successorEndTime > 1)) {
-                sandbaggedEndTime = successorEndTime - 1;
-            }
-        }
-
-        UINFO(6, "Sandbagged end time for " << mtaskp->name() << " on th " << threadId << " = "
-                                            << sandbaggedEndTime << endl);
-        return sandbaggedEndTime;
-    }
-
-    bool isReady(ThreadSchedule& schedule, const ExecMTask* mtaskp) {
-        for (V3GraphEdge* edgeInp = mtaskp->inBeginp(); edgeInp; edgeInp = edgeInp->inNextp()) {
-            const ExecMTask* const prevp = edgeInp->fromp()->as<ExecMTask>();
-            if (schedule.threadId(prevp) == ThreadSchedule::UNASSIGNED) {
-                // This predecessor is not assigned yet
-                return false;
-            }
-        }
-        return true;
-    }
-
-public:
-    // Pack the MTasks from the given graph into m_nThreads threads, return the schedule.
-    const ThreadSchedule pack(const V3Graph& mtaskGraph) {
-        // The result
-        ThreadSchedule schedule{m_nThreads};
-
-        // Time each thread is occupied until
-        std::vector<uint32_t> busyUntil(m_nThreads, 0);
-
-        // MTasks ready to be assigned next. All their dependencies are already assigned.
-        std::set<ExecMTask*, MTaskCmp> readyMTasks;
-
-        // Build initial ready list
-        for (V3GraphVertex* vxp = mtaskGraph.verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
-            ExecMTask* const mtaskp = vxp->as<ExecMTask>();
-            if (isReady(schedule, mtaskp)) readyMTasks.insert(mtaskp);
-        }
-
-        while (!readyMTasks.empty()) {
-            // For each task in the ready set, compute when it might start
-            // on each thread (in that thread's local time frame.)
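// NOTE: illustrative sketch only, not from the Verilator sources. The selection
// below is a greedy argmin: among all (thread, ready task) pairs it picks the
// earliest feasible start time, breaking ties by higher task priority. With
// hypothetical values busyUntil = {100, 250} and a dependency ending at time 90,
// thread 0 offers the earlier start:
constexpr uint32_t exStartOnTh0 = 100 > 90 ? 100 : 90;  // max(busyUntil[0], dep end)
constexpr uint32_t exStartOnTh1 = 250 > 90 ? 250 : 90;  // max(busyUntil[1], dep end)
static_assert(exStartOnTh0 < exStartOnTh1,
              "thread 0 wins the greedy argmin; exact ties would fall to priority");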
-            uint32_t bestTime = 0xffffffff;
-            uint32_t bestThreadId = 0;
-            ExecMTask* bestMtaskp = nullptr;  // Todo: const ExecMTask*
-            for (uint32_t threadId = 0; threadId < m_nThreads; ++threadId) {
-                for (ExecMTask* const mtaskp : readyMTasks) {
-                    uint32_t timeBegin = busyUntil[threadId];
-                    if (timeBegin > bestTime) {
-                        UINFO(6, "th " << threadId << " busy until " << timeBegin
-                                       << ", later than bestTime " << bestTime
-                                       << ", skipping thread.\n");
-                        break;
-                    }
-                    for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep;
-                         edgep = edgep->inNextp()) {
-                        const ExecMTask* const priorp = edgep->fromp()->as<ExecMTask>();
-                        const uint32_t priorEndTime = completionTime(schedule, priorp, threadId);
-                        if (priorEndTime > timeBegin) timeBegin = priorEndTime;
-                    }
-                    UINFO(6, "Task " << mtaskp->name() << " start at " << timeBegin
-                                     << " on thread " << threadId << endl);
-                    if ((timeBegin < bestTime)
-                        || ((timeBegin == bestTime)
-                            && bestMtaskp  // Redundant, but appeases static analysis tools
-                            && (mtaskp->priority() > bestMtaskp->priority()))) {
-                        bestTime = timeBegin;
-                        bestThreadId = threadId;
-                        bestMtaskp = mtaskp;
-                    }
-                }
-            }
-
-            UASSERT(bestMtaskp, "Should have found some task");
-            UINFO(6, "Will schedule " << bestMtaskp->name() << " onto thread " << bestThreadId
-                                      << endl);
-
-            // Reference to thread in schedule we are assigning this MTask to.
-            std::vector<const ExecMTask*>& bestThread = schedule.threads[bestThreadId];
-
-            // Update algorithm state
-            bestMtaskp->predictStart(bestTime);  // Only for gantt reporting
-            const uint32_t bestEndTime = bestTime + bestMtaskp->cost();
-            schedule.mtaskState[bestMtaskp].completionTime = bestEndTime;
-            schedule.mtaskState[bestMtaskp].threadId = bestThreadId;
-            if (!bestThread.empty()) schedule.mtaskState[bestThread.back()].nextp = bestMtaskp;
-            busyUntil[bestThreadId] = bestEndTime;
-
-            // Add the MTask to the schedule
-            bestThread.push_back(bestMtaskp);
-
-            // Update the ready list
-            const size_t erased = readyMTasks.erase(bestMtaskp);
-            UASSERT_OBJ(erased > 0, bestMtaskp, "Should have erased something?");
-            for (V3GraphEdge* edgeOutp = bestMtaskp->outBeginp(); edgeOutp;
-                 edgeOutp = edgeOutp->outNextp()) {
-                ExecMTask* const nextp = edgeOutp->top()->as<ExecMTask>();
-                // Dependent MTask should not yet be assigned to a thread
-                UASSERT(schedule.threadId(nextp) == ThreadSchedule::UNASSIGNED,
-                        "Tasks after one being assigned should not be assigned yet");
-                // Dependent MTask should not be ready yet, since dependency is just being assigned
-                UASSERT_OBJ(readyMTasks.find(nextp) == readyMTasks.end(), nextp,
-                            "Tasks after one being assigned should not be ready");
-                if (isReady(schedule, nextp)) {
-                    readyMTasks.insert(nextp);
-                    UINFO(6, "Inserted " << nextp->name() << " into ready\n");
-                }
-            }
-        }
-
-        if (dumpGraphLevel() >= 4) schedule.dumpDotFilePrefixedAlways(mtaskGraph, "schedule");
-
-        return schedule;
-    }
-
-    // SELF TEST
-    static void selfTest() {
-        V3Graph graph;
-        ExecMTask* const t0 = new ExecMTask{&graph, nullptr, 0};
-        t0->cost(1000);
-        t0->priority(1100);
-        ExecMTask* const t1 = new ExecMTask{&graph, nullptr, 1};
-        t1->cost(100);
-        t1->priority(100);
-        ExecMTask* const t2 = new ExecMTask{&graph, nullptr, 2};
-        t2->cost(100);
-        t2->priority(100);
-
-        new V3GraphEdge{&graph, t0, t1, 1};
-        new V3GraphEdge{&graph, t0, t2, 1};
-
-        PartPackMTasks packer{2,  // Threads
-                              3,  // Sandbag numerator
-                              10};  // Sandbag denom
-        const ThreadSchedule& schedule = packer.pack(graph);
-
-        UASSERT_SELFTEST(size_t, schedule.threads.size(), 2);
-
-        UASSERT_SELFTEST(size_t, schedule.threads[0].size(), 2);
-        UASSERT_SELFTEST(size_t, schedule.threads[1].size(), 1);
-
-        UASSERT_SELFTEST(const ExecMTask*, schedule.threads[0][0], t0);
-        UASSERT_SELFTEST(const ExecMTask*, schedule.threads[0][1], t1);
-        UASSERT_SELFTEST(const ExecMTask*, schedule.threads[1][0], t2);
-
-        UASSERT_SELFTEST(size_t, schedule.mtaskState.size(), 3);
-
-        UASSERT_SELFTEST(uint32_t, schedule.threadId(t0), 0);
-        UASSERT_SELFTEST(uint32_t, schedule.threadId(t1), 0);
-        UASSERT_SELFTEST(uint32_t, schedule.threadId(t2), 1);
-
-        // On its native thread, we see the actual end time for t0:
-        UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t0, 0), 1000);
-        // On the other thread, we see a sandbagged end time which does not
-        // exceed the t1 end time:
-        UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t0, 1), 1099);
-
-        // Actual end time on native thread:
-        UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t1, 0), 1100);
-        // Sandbagged end time seen on thread 1. Note it does not compound
-        // with t0's sandbagged time; compounding caused trouble in
-        // practice.
-        UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t1, 1), 1130);
-        UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t2, 0), 1229);
-        UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t2, 1), 1199);
-    }
-
-private:
-    VL_UNCOPYABLE(PartPackMTasks);
-};
-
-//######################################################################
-// V3Partition implementation
-
-void V3Partition::debugMTaskGraphStats(const V3Graph* graphp, const string& stage) {
-    if (!debug() && !dumpLevel() && !dumpGraphLevel()) return;
-
-    UINFO(4, "\n");
-    UINFO(4, " Stats for " << stage << endl);
-    uint32_t mtaskCount = 0;
-    uint32_t totalCost = 0;
-    std::array<uint32_t, 32> mtaskCostHist;
-    mtaskCostHist.fill(0);
-
-    for (const V3GraphVertex* mtaskp = graphp->verticesBeginp(); mtaskp;
-         mtaskp = mtaskp->verticesNextp()) {
-        ++mtaskCount;
-        uint32_t mtaskCost = mtaskp->as<AbstractMTask>()->cost();
-        totalCost += mtaskCost;
-
-        unsigned log2Cost = 0;
-        while (mtaskCost >>= 1) ++log2Cost;
-        UASSERT(log2Cost < 32, "log2Cost overflow in debugMTaskGraphStats");
-        ++mtaskCostHist[log2Cost];
-    }
-    UINFO(4, "  Total mtask cost = " << totalCost << "\n");
-    UINFO(4, "  Mtask count = " << mtaskCount << "\n");
-    UINFO(4, "  Avg cost / mtask = "
-                 << ((mtaskCount > 0) ? cvtToStr(totalCost / mtaskCount) : "INF!") << "\n");
-    UINFO(4, "  Histogram of mtask costs:\n");
-    for (unsigned i = 0; i < 32; ++i) {
-        if (mtaskCostHist[i]) {
-            UINFO(4, "    2^" << i << ": " << mtaskCostHist[i] << endl);
-            V3Stats::addStat("MTask graph, " + stage + ", mtask cost 2^" + (i < 10 ? " " : "")
-                                 + cvtToStr(i),
-                             mtaskCostHist[i]);
-        }
-    }
-
-    if (mtaskCount < 1000) {
-        string filePrefix("ordermv_");
-        filePrefix += stage;
-        if (dumpGraphLevel() >= 4) graphp->dumpDotFilePrefixedAlways(filePrefix);
-    }
-
-    // Look only at the cost of each mtask, neglect communication cost.
-    // This will show us how much parallelism we expect, assuming cache-miss
-    // costs are minor and the cost of running logic is the dominant cost.
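// NOTE: illustrative sketch only, not from the Verilator sources. The parallelism
// factor reported just below is, in essence, total graph cost divided by critical
// path cost. Using the figures asserted in selfTestX() earlier in this file:
constexpr uint32_t exTotalGraphCost = 101;
constexpr uint32_t exCriticalPathCost = 19;
static_assert(exTotalGraphCost / exCriticalPathCost == 5,
              "roughly 5x parallelism is the most that example graph could achieve");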
-    const auto report = graphp->parallelismReport(
-        [](const V3GraphVertex* vtxp) { return vtxp->as<AbstractMTask>()->cost(); });
-    V3Stats::addStat("MTask graph, " + stage + ", critical path cost", report.criticalPathCost());
-    V3Stats::addStat("MTask graph, " + stage + ", total graph cost", report.totalGraphCost());
-    V3Stats::addStat("MTask graph, " + stage + ", mtask count", report.vertexCount());
-    V3Stats::addStat("MTask graph, " + stage + ", edge count", report.edgeCount());
-    V3Stats::addStat("MTask graph, " + stage + ", parallelism factor", report.parallelismFactor());
-    if (debug() >= 4) {
-        UINFO(0, "\n");
-        UINFO(0, "    MTask parallelism estimate based on costs at stage " << stage << ":\n");
-        UINFO(0, "    Critical path cost = " << report.criticalPathCost() << "\n");
-        UINFO(0, "    Total graph cost = " << report.totalGraphCost() << "\n");
-        UINFO(0, "    MTask vertex count = " << report.vertexCount() << "\n");
-        UINFO(0, "    Edge count = " << report.edgeCount() << "\n");
-        UINFO(0, "    Parallelism factor = " << report.parallelismFactor() << "\n");
-    }
-}
-
-// Print a hash of the shape of graphp. If you are battling
-// nondeterminism, this can help to pinpoint where in the pipeline it's
-// creeping in.
-void V3Partition::hashGraphDebug(const V3Graph* graphp, const char* debugName) {
-    // Disabled when there are no nondeterminism issues in flight.
-    if (!v3Global.opt.debugNondeterminism()) return;
-
-    std::unordered_map<const V3GraphVertex*, unsigned> vx2Id;
-    unsigned id = 0;
-    for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
-        vx2Id[vxp] = id++;
-    }
-    unsigned hash = 0;
-    for (const V3GraphVertex* vxp = graphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
-        for (const V3GraphEdge* edgep = vxp->outBeginp(); edgep; edgep = edgep->outNextp()) {
-            const V3GraphVertex* const top = edgep->top();
-            hash = vx2Id[top] + 31U * hash;  // The K&R hash function
-        }
-    }
-    UINFO(0, "Hash of shape (not contents) of " << debugName << " = " << cvtToStr(hash) << endl);
-}
-
-// Predicate function to determine what MTaskMoveVertex to bypass when constructing the MTask
-// graph. The fine-grained dependency graph of MTaskMoveVertex vertices is a bipartite graph of:
-// - 1. MTaskMoveVertex instances containing logic via OrderLogicVertex
-//      (MTaskMoveVertex::logicp() != nullptr)
-// - 2. MTaskMoveVertex instances containing an (OrderVarVertex, domain) pair
-// Our goal is to order the logic vertices. The second type of variable/domain vertices only carry
-// dependencies and are eventually discarded. In order to reduce the working set size of
-// PartContraction, we 'bypass' and do not create LogicMTask vertices for the variable vertices,
-// and instead add the transitive dependencies directly, but only if adding the transitive edges
-// directly does not require more dependency edges than keeping the intermediate vertex. That is,
-// we bypass a variable vertex if fanIn * fanOut <= fanIn + fanOut. This can only be true if fanIn
-// or fanOut are 1, or if they are both 2. This can cause significant reduction in working set
-// size.
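// NOTE: illustrative sketch only, not from the Verilator sources. The bypass
// inequality above can be checked exhaustively for the small fan counts involved;
// only (1, k), (k, 1), and (2, 2) shapes qualify:
static_assert(1 * 9 <= 1 + 9, "fan-in (or fan-out) of 1 always bypasses");
static_assert(2 * 2 <= 2 + 2, "(2, 2) bypasses: 4 transitive edges replace 4 existing");
static_assert(!(2 * 3 <= 2 + 3), "(2, 3) keeps the vertex: 6 edges would replace 5");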
-static bool bypassOk(MTaskMoveVertex* mvtxp) {
-    // Need to keep all logic vertices
-    if (mvtxp->logicp()) return false;
-    // Count fan-in, up to 3
-    unsigned fanIn = 0;
-    for (V3GraphEdge* edgep = mvtxp->inBeginp(); edgep; edgep = edgep->inNextp()) {
-        if (++fanIn == 3) break;
-    }
-    UDEBUGONLY(UASSERT_OBJ(fanIn <= 3, mvtxp, "Should have stopped counting fanIn"););
-    // If fan-in is no more than one, bypass
-    if (fanIn <= 1) return true;
-    // Count fan-out, up to 3
-    unsigned fanOut = 0;
-    for (V3GraphEdge* edgep = mvtxp->outBeginp(); edgep; edgep = edgep->outNextp()) {
-        if (++fanOut == 3) break;
-    }
-    UDEBUGONLY(UASSERT_OBJ(fanOut <= 3, mvtxp, "Should have stopped counting fanOut"););
-    // If fan-out is no more than one, bypass
-    if (fanOut <= 1) return true;
-    // They can only be (2, 2), (2, 3), (3, 2), (3, 3) at this point, bypass if (2, 2)
-    return fanIn + fanOut == 4;
-}
-
-uint32_t V3Partition::setupMTaskDeps(V3Graph* mtasksp) {
-    uint32_t totalGraphCost = 0;
-
-    // Artificial single entry point vertex in the MTask graph to allow sibling merges.
-    // This is required as otherwise disjoint sub-graphs could not be merged, but the
-    // coarsening algorithm assumes that the graph is connected.
-    m_entryMTaskp = new LogicMTask{mtasksp, nullptr};
-
-    // The V3InstrCount within LogicMTask will set user1 on each AST
-    // node, to assert that we never count any node twice.
-    const VNUser1InUse user1inUse;
-
-    // Create the LogicMTasks for each MTaskMoveVertex
-    for (V3GraphVertex *vtxp = m_fineDepsGraphp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
-        nextp = vtxp->verticesNextp();
-        MTaskMoveVertex* const mVtxp = static_cast<MTaskMoveVertex*>(vtxp);
-        if (bypassOk(mVtxp)) {
-            mVtxp->userp(nullptr);  // Set to nullptr to mark as bypassed
-        } else {
-            LogicMTask* const mtaskp = new LogicMTask{mtasksp, mVtxp};
-            mVtxp->userp(mtaskp);
-            totalGraphCost += mtaskp->cost();
-        }
-    }
-
-    // Artificial single exit point vertex in the MTask graph to allow sibling merges.
-    // This enables merging MTasks with no downstream dependents if that is the ideal merge.
-    m_exitMTaskp = new LogicMTask{mtasksp, nullptr};
-
-    // Create the mtask->mtask dependency edges based on the dependencies between MTaskMoveVertex
-    // vertices.
-    for (V3GraphVertex *vtxp = mtasksp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
-        nextp = vtxp->verticesNextp();
-        LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp);
-
-        // Entry and exit vertices handled separately
-        if (VL_UNLIKELY((mtaskp == m_entryMTaskp) || (mtaskp == m_exitMTaskp))) continue;
-
-        // At this point, there should only be one MTaskMoveVertex per LogicMTask
-        UASSERT_OBJ(mtaskp->vertexListp()->size() == 1, mtaskp, "Multiple MTaskMoveVertex");
-        MTaskMoveVertex* const mvtxp = mtaskp->vertexListp()->front();
-        UASSERT_OBJ(mvtxp->userp(), mtaskp, "Bypassed MTaskMoveVertex should not have MTask");
-
-        // Function to add an edge to a dependent from 'mtaskp'
-        const auto addEdge = [mtasksp, mtaskp](LogicMTask* otherp) {
-            UASSERT_OBJ(otherp != mtaskp, mtaskp, "Would create a cycle edge");
-            if (mtaskp->hasRelativeMTask(otherp)) return;  // Don't create redundant edges.
-            new MTaskEdge{mtasksp, mtaskp, otherp, 1};
-        };
-
-        // Iterate downstream direct dependents
-        for (V3GraphEdge *dEdgep = mvtxp->outBeginp(), *dNextp; dEdgep; dEdgep = dNextp) {
-            dNextp = dEdgep->outNextp();
-            V3GraphVertex* const top = dEdgep->top();
-            if (LogicMTask* const otherp = static_cast<LogicMTask*>(top->userp())) {
-                // The opposite end of the edge is not a bypassed vertex, add as direct dependent
-                addEdge(otherp);
-            } else {
-                // The opposite end of the edge is a bypassed vertex, add transitive dependents
-                for (V3GraphEdge *tEdgep = top->outBeginp(), *tNextp; tEdgep; tEdgep = tNextp) {
-                    tNextp = tEdgep->outNextp();
-                    LogicMTask* const transp = static_cast<LogicMTask*>(tEdgep->top()->userp());
-                    // The Move graph is bipartite (logic <-> var), and logic is never bypassed,
-                    // hence 'transp' must not be nullptr.
-                    UASSERT_OBJ(transp, mvtxp, "This cannot be a bypassed vertex");
-                    addEdge(transp);
-                }
-            }
-        }
-    }
-
-    // Create dependencies to/from the entry/exit vertices.
-    for (V3GraphVertex *vtxp = mtasksp->verticesBeginp(), *nextp; vtxp; vtxp = nextp) {
-        nextp = vtxp->verticesNextp();
-        LogicMTask* const mtaskp = static_cast<LogicMTask*>(vtxp);
-
-        if (VL_UNLIKELY((mtaskp == m_entryMTaskp) || (mtaskp == m_exitMTaskp))) continue;
-
-        // Add the entry/exit edges
-        if (mtaskp->inEmpty()) new MTaskEdge{mtasksp, m_entryMTaskp, mtaskp, 1};
-        if (mtaskp->outEmpty()) new MTaskEdge{mtasksp, mtaskp, m_exitMTaskp, 1};
-    }
-
-    return totalGraphCost;
-}
-
-void V3Partition::go(V3Graph* mtasksp) {
-    // Called by V3Order
-    hashGraphDebug(m_fineDepsGraphp, "v3partition initial fine-grained deps");
-
-    // Create the first MTasks. Initially, each MTask just wraps one
-    // MTaskMoveVertex. Over time, we'll merge MTasks together and
-    // eventually each MTask will wrap a large number of MTaskMoveVertices
-    // (and the logic nodes therein.)
-    const uint32_t totalGraphCost = setupMTaskDeps(mtasksp);
-
-    V3Partition::debugMTaskGraphStats(mtasksp, "initial");
-
-    // For debug: print out the longest critical path. This allows us to
-    // verify that the costs look reasonable, that we aren't combining
-    // nodes that should probably be split, etc.
-    if (dumpLevel() >= 3) LogicMTask::dumpCpFilePrefixed(mtasksp, "cp");
-
-    // Merge nodes that could present data hazards; see comment within.
-    {
-        PartFixDataHazards{m_orderGraphp, mtasksp}.go();
-        V3Partition::debugMTaskGraphStats(mtasksp, "hazards");
-        hashGraphDebug(mtasksp, "mtasksp after fixDataHazards()");
-    }
-
-    // Setup the critical path into and out of each node.
-    partInitCriticalPaths(mtasksp);
-    hashGraphDebug(mtasksp, "after partInitCriticalPaths()");
-
-    // Order the graph. We know it's already ranked from fixDataHazards()
-    // so we don't need to rank it again.
-    //
-    // On at least some models, ordering the graph here seems to help
-    // performance. (Why? Is it just triggering noise in a lucky direction?
-    // Is it just as likely to harm results?)
-    //
-    // More diversity of models that can build with --threads will
-    // eventually tell us. For now keep the order() so we don't forget
-    // about it, in case it actually helps. TODO: get more data and maybe
-    // remove this later if it doesn't really help.
-    mtasksp->orderPreRanked();
-
-    const int targetParFactor = v3Global.opt.threads();
-    UASSERT(targetParFactor >= 2, "Should not reach V3Partition when --threads <= 1");
-
-    // Set cpLimit to roughly totalGraphCost / nThreads
-    //
-    // Actually set it a bit lower, by a hardcoded fudge factor. This
-    // results in more, smaller mtasks, which helps reduce fragmentation
-    // when scheduling them.
-    const unsigned fudgeNumerator = 3;
-    const unsigned fudgeDenominator = 5;
-    const uint32_t cpLimit
-        = ((totalGraphCost * fudgeNumerator) / (targetParFactor * fudgeDenominator));
-    UINFO(4, "V3Partition set cpLimit = " << cpLimit << endl);
-
-    // Merge MTask nodes together, repeatedly, until the CP budget is
-    // reached. Coarsens the graph, usually by several orders of
-    // magnitude.
-    //
-    // Some tests disable this, hence the test on threadsCoarsen().
-    // Coarsening is always enabled in production.
-    if (v3Global.opt.threadsCoarsen()) {
-        PartContraction{mtasksp, cpLimit, m_entryMTaskp, m_exitMTaskp,
-                        // --debugPartition is used by tests
-                        // to enable slow assertions.
-                        v3Global.opt.debugPartition()}
-            .go();
-        V3Partition::debugMTaskGraphStats(mtasksp, "contraction");
-    }
-    {
-        mtasksp->removeTransitiveEdges();
-        V3Partition::debugMTaskGraphStats(mtasksp, "transitive1");
-    }
-
-    // Reassign MTask IDs onto smaller numbers, which should be more stable
-    // across small logic changes. Keep MTask IDs in the same relative
-    // order though, otherwise we break CmpLogicMTask for still-existing
-    // EdgeSet's that haven't destructed yet.
-    {
-        using SortedMTaskSet = std::set<LogicMTask*, CmpLogicMTask>;
-        SortedMTaskSet sorted;
-        for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) {
-            LogicMTask* const mtaskp = static_cast<LogicMTask*>(itp);
-            sorted.insert(mtaskp);
-        }
-        for (auto it = sorted.begin(); it != sorted.end(); ++it) {
-            // We shouldn't perturb the sort order of the set, despite
-            // changing the IDs; they should all just remain in the same
-            // relative order. Confirm that:
-            const uint32_t nextId = v3Global.rootp()->allocNextMTaskID();
-            UASSERT(nextId <= (*it)->id(), "Should only shrink MTaskIDs here");
-            UINFO(4, "Reassigning MTask id " << (*it)->id() << " to id " << nextId << "\n");
-            (*it)->id(nextId);
-        }
-    }
-
-    // Set color to indicate an mtaskId on every underlying MTaskMoveVertex.
-    for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp; itp = itp->verticesNextp()) {
-        const LogicMTask* const mtaskp = static_cast<const LogicMTask*>(itp);
-        for (LogicMTask::VxList::const_iterator it = mtaskp->vertexListp()->begin();
-             it != mtaskp->vertexListp()->end(); ++it) {
-            MTaskMoveVertex* const mvertexp = *it;
-            mvertexp->color(mtaskp->id());
-        }
-    }
-}
-
-void add(std::unordered_map<int, uint64_t>& cmap, int id, uint64_t cost) { cmap[id] += cost; }
-
-using EstimateAndProfiled = std::pair<uint64_t, uint64_t>;  // cost est, cost profiled
-using Costs = std::unordered_map<int, EstimateAndProfiled>;
-
-static void normalizeCosts(Costs& costs) {
-    const auto scaleCost = [](uint64_t value, double multiplier) {
-        double scaled = static_cast<double>(value) * multiplier;
-        if (value && scaled < 1) scaled = 1;
-        return static_cast<uint64_t>(scaled);
-    };
-
-    // For all costs with a profile, compute sums
-    uint64_t sumCostProfiled = 0;  // For data with estimate and profile
-    uint64_t sumCostEstimate = 0;  // For data with estimate and profile
-    for (const auto& est : costs) {
-        if (est.second.second) {
-            sumCostEstimate += est.second.first;
-            sumCostProfiled += est.second.second;
-        }
-    }
-
-    if (sumCostEstimate) {
-        // For data where we don't have profiled data, compute how much to
-        // scale up/down the estimate to put it on the same relative scale as
-        // the profiled data. (Improves results if only a few profiles are missing.)
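// NOTE: illustrative sketch only, not from the Verilator sources. The multiplier
// computed just below is a plain ratio. With estimates {10, 30} profiled as
// {1000, 3000}, it is (1000 + 3000) / (10 + 30) = 100, so the unprofiled
// estimate 20 becomes 2000 -- matching selfTestNormalizeCosts() further down:
static_assert((1000 + 3000) / (10 + 30) == 100, "estimate-to-profile multiplier");
static_assert(20 * 100 == 2000, "unprofiled estimate rescaled onto the profiled scale");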
-
-void add(std::unordered_map<int, uint64_t>& cmap, int id, uint64_t cost) { cmap[id] += cost; }
-
-using EstimateAndProfiled = std::pair<uint64_t, uint64_t>;  // cost est, cost profiled
-using Costs = std::unordered_map<int, EstimateAndProfiled>;
-
-static void normalizeCosts(Costs& costs) {
-    const auto scaleCost = [](uint64_t value, double multiplier) {
-        double scaled = static_cast<double>(value) * multiplier;
-        if (value && scaled < 1) scaled = 1;
-        return static_cast<uint64_t>(scaled);
-    };
-
-    // For all costs with a profile, compute sum
-    uint64_t sumCostProfiled = 0;  // For data with estimate and profile
-    uint64_t sumCostEstimate = 0;  // For data with estimate and profile
-    for (const auto& est : costs) {
-        if (est.second.second) {
-            sumCostEstimate += est.second.first;
-            sumCostProfiled += est.second.second;
-        }
-    }
-
-    if (sumCostEstimate) {
-        // For data where we don't have profiled data, compute how much to
-        // scale the estimate up/down to put it on the same relative scale
-        // as the profiled data. (Improves results if only a few profiles
-        // are missing.)
-        const double estToProfile
-            = static_cast<double>(sumCostProfiled) / static_cast<double>(sumCostEstimate);
-        UINFO(5, "Estimated data needs scaling by "
-                     << estToProfile << ", sumCostProfiled=" << sumCostProfiled
-                     << " sumCostEstimate=" << sumCostEstimate << endl);
-        for (auto& est : costs) {
-            uint64_t& costEstimate = est.second.first;
-            costEstimate = scaleCost(costEstimate, estToProfile);
-        }
-    }
-
-    // Costs can overflow a uint32_t. Using the maximum value of the costs, scale all down
-    uint64_t maxCost = 0;
-    for (auto& est : costs) {
-        const uint64_t& costEstimate = est.second.first;
-        const uint64_t& costProfiled = est.second.second;
-        if (maxCost < costEstimate) maxCost = costEstimate;
-        if (maxCost < costProfiled) maxCost = costProfiled;
-        UINFO(9,
-              "Post uint scale: ce = " << est.second.first << " cp=" << est.second.second << endl);
-    }
-    const uint64_t scaleDownTo = 10000000;  // Extra room for future algorithms to add costs
-    if (maxCost > scaleDownTo) {
-        const double scaledown = static_cast<double>(scaleDownTo) / static_cast<double>(maxCost);
-        UINFO(5, "Scaling data to within 32-bits by multiplying by=" << scaledown << ", maxCost="
-                                                                     << maxCost << endl);
-        for (auto& est : costs) {
-            est.second.first = scaleCost(est.second.first, scaledown);
-            est.second.second = scaleCost(est.second.second, scaledown);
-        }
-    }
-}
-
-void V3Partition::selfTestNormalizeCosts() {
-    {  // Test that omitted profile data correctly scales estimates
-        Costs costs({// id   est  prof
-                     {1, {10, 1000}},
-                     {2, {20, 0}},  // Note no profile
-                     {3, {30, 3000}}});
-        normalizeCosts(costs);
-        UASSERT_SELFTEST(uint64_t, costs[1].first, 1000);
-        UASSERT_SELFTEST(uint64_t, costs[1].second, 1000);
-        UASSERT_SELFTEST(uint64_t, costs[2].first, 2000);
-        UASSERT_SELFTEST(uint64_t, costs[2].second, 0);
-        UASSERT_SELFTEST(uint64_t, costs[3].first, 3000);
-        UASSERT_SELFTEST(uint64_t, costs[3].second, 3000);
-    }
-    {  // Test that very large profile data properly scales
-        Costs costs({// id   est  prof
-                     {1, {10, 100000000000}},
-                     {2, {20, 200000000000}},
-                     {3, {30, 1}}});  // Make sure doesn't underflow
-        normalizeCosts(costs);
-        UASSERT_SELFTEST(uint64_t, costs[1].first, 2500000);
-        UASSERT_SELFTEST(uint64_t, costs[1].second, 5000000);
-        UASSERT_SELFTEST(uint64_t, costs[2].first, 5000000);
-        UASSERT_SELFTEST(uint64_t, costs[2].second, 10000000);
-        UASSERT_SELFTEST(uint64_t, costs[3].first, 7500000);
-        UASSERT_SELFTEST(uint64_t, costs[3].second, 1);
-    }
-}
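
The normalization is self-contained enough to sketch outside Verilator. This approximation (hypothetical `normalize`/`scaleCost` helpers, plain std types, no UINFO/UASSERT) reproduces both stages: rescale estimates onto the profiled scale, then scale everything to at most 10,000,000 so later 32-bit math cannot overflow:

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <utility>

using Costs = std::unordered_map<int, std::pair<uint64_t, uint64_t>>;  // id -> {estimate, profiled}

static uint64_t scaleCost(uint64_t value, double multiplier) {
    const double scaled = static_cast<double>(value) * multiplier;
    return (value && scaled < 1) ? 1 : static_cast<uint64_t>(scaled);  // Guard against underflow to 0
}

static void normalize(Costs& costs) {
    // Stage 1: put estimates on the profiled scale
    uint64_t sumEst = 0;
    uint64_t sumProf = 0;
    for (const auto& kv : costs) {
        if (kv.second.second) {
            sumEst += kv.second.first;
            sumProf += kv.second.second;
        }
    }
    if (sumEst) {
        const double ratio = static_cast<double>(sumProf) / static_cast<double>(sumEst);
        for (auto& kv : costs) kv.second.first = scaleCost(kv.second.first, ratio);
    }
    // Stage 2: scale everything down so values comfortably fit in 32 bits
    uint64_t maxCost = 0;
    for (const auto& kv : costs) maxCost = std::max({maxCost, kv.second.first, kv.second.second});
    const uint64_t limit = 10000000;
    if (maxCost > limit) {
        const double down = static_cast<double>(limit) / static_cast<double>(maxCost);
        for (auto& kv : costs) {
            kv.second.first = scaleCost(kv.second.first, down);
            kv.second.second = scaleCost(kv.second.second, down);
        }
    }
}

int main() {
    // Same values as the first self-test above
    Costs costs{{1, {10, 1000}}, {2, {20, 0}}, {3, {30, 3000}}};
    normalize(costs);
    assert(costs[2].first == 2000);  // Estimate rescaled onto the profiled scale
    return 0;
}
```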
-
-static void fillinCosts(V3Graph* execMTaskGraphp) {
-    V3UniqueNames m_uniqueNames;  // For generating unique mtask profile hash names
-
-    // Pass 1: See what profiling data applies
-    Costs costs;  // For each mtask, costs
-
-    for (const V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp;
-         vxp = vxp->verticesNextp()) {
-        ExecMTask* const mtp = const_cast<V3GraphVertex*>(vxp)->as<ExecMTask>();
-        // Compute name of mtask, for hash lookup
-        mtp->hashName(m_uniqueNames.get(mtp->bodyp()));
-
-        // This estimate is 64 bits, but the final mtask graph algorithm needs 32 bits
-        const uint64_t costEstimate = V3InstrCount::count(mtp->bodyp(), false);
-        const uint64_t costProfiled
-            = V3Config::getProfileData(v3Global.opt.prefix(), mtp->hashName());
-        if (costProfiled) {
-            UINFO(5, "Profile data for mtask " << mtp->id() << " " << mtp->hashName()
-                                               << " cost override " << costProfiled << endl);
-        }
-        costs[mtp->id()] = std::make_pair(costEstimate, costProfiled);
-    }
-
-    normalizeCosts(costs /*ref*/);
-
-    int totalEstimates = 0;
-    int missingProfiles = 0;
-    for (const V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp;
-         vxp = vxp->verticesNextp()) {
-        ExecMTask* const mtp = const_cast<V3GraphVertex*>(vxp)->as<ExecMTask>();
-        const uint32_t costEstimate = costs[mtp->id()].first;
-        const uint64_t costProfiled = costs[mtp->id()].second;
-        UINFO(9, "ce = " << costEstimate << " cp=" << costProfiled << endl);
-        UASSERT(costEstimate <= (1UL << 31), "cost scaling math would overflow uint32");
-        UASSERT(costProfiled <= (1UL << 31), "cost scaling math would overflow uint32");
-        const uint32_t costProfiled32 = static_cast<uint32_t>(costProfiled);
-        uint32_t costToUse = costProfiled32;
-        if (!costProfiled32) {
-            costToUse = costEstimate;
-            if (costEstimate != 0) ++missingProfiles;
-        }
-        if (costEstimate != 0) ++totalEstimates;
-        mtp->cost(costToUse);
-        mtp->priority(costToUse);
-    }
-
-    if (missingProfiles) {
-        if (FileLine* const fl = V3Config::getProfileDataFileLine()) {
-            fl->v3warn(PROFOUTOFDATE, "Profile data for mtasks may be out of date. "
-                                          << missingProfiles << " of " << totalEstimates
-                                          << " mtasks had no data");
-        }
-    }
-}
-
-static void finalizeCosts(V3Graph* execMTaskGraphp) {
-    GraphStreamUnordered ser(execMTaskGraphp, GraphWay::REVERSE);
-    while (const V3GraphVertex* const vxp = ser.nextp()) {
-        ExecMTask* const mtp = const_cast<V3GraphVertex*>(vxp)->as<ExecMTask>();
-        // "Priority" is the critical path from the start of the mtask, to
-        // the end of the graph reachable from this mtask. Given the
-        // choice among several ready mtasks, we'll want to start the
-        // highest priority one first, so we're always working on the "long
-        // pole".
-        for (V3GraphEdge* edgep = mtp->outBeginp(); edgep; edgep = edgep->outNextp()) {
-            const ExecMTask* const followp = edgep->top()->as<ExecMTask>();
-            if ((followp->priority() + mtp->cost()) > mtp->priority()) {
-                mtp->priority(followp->priority() + mtp->cost());
-            }
-        }
-    }
-
-    // Some MTasks may now have zero cost, eliminate those.
-    // (It's common for tasks to shrink to nothing when V3LifePost
-    // removes dly assignments.)
-    for (V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp;) {
-        ExecMTask* const mtp = vxp->as<ExecMTask>();
-        vxp = vxp->verticesNextp();  // Advance before delete
-
-        // Don't rely on checking mtp->cost() == 0 to detect an empty task.
-        // Our cost-estimating logic is just an estimate. Instead, check
-        // the MTaskBody to see if it's empty. That's the source of truth.
-        AstMTaskBody* const bodyp = mtp->bodyp();
-        if (!bodyp->stmtsp()) {  // Kill this empty mtask
-            UINFO(6, "Removing zero-cost " << mtp->name() << endl);
-            for (V3GraphEdge* inp = mtp->inBeginp(); inp; inp = inp->inNextp()) {
-                for (V3GraphEdge* outp = mtp->outBeginp(); outp; outp = outp->outNextp()) {
-                    new V3GraphEdge{execMTaskGraphp, inp->fromp(), outp->top(), 1};
-                }
-            }
-            VL_DO_DANGLING(mtp->unlinkDelete(execMTaskGraphp), mtp);
-            // Also remove and delete the AstMTaskBody, otherwise it would
-            // keep a dangling pointer to the ExecMTask.
-            VL_DO_DANGLING(bodyp->unlinkFrBack()->deleteTree(), bodyp);
-        }
-    }
-
-    // Assign profiler IDs
-    for (V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp; vxp = vxp->verticesNextp()) {
-        static_cast<ExecMTask*>(vxp)->profilerId(v3Global.rootp()->allocNextMTaskProfilingID());
-    }
-
-    // Removing tasks may cause edges that were formerly non-transitive to
-    // become transitive. Also we just created new edges around the removed
-    // tasks, which could be transitive. Prune out all transitive edges.
-    {
-        execMTaskGraphp->removeTransitiveEdges();
-        V3Partition::debugMTaskGraphStats(execMTaskGraphp, "transitive2");
-    }
-
-    // Record summary stats for the final mtasks graph.
-    const auto report = execMTaskGraphp->parallelismReport(
-        [](const V3GraphVertex* vtxp) { return vtxp->as<ExecMTask>()->cost(); });
-    V3Stats::addStat("MTask graph, final, critical path cost", report.criticalPathCost());
-    V3Stats::addStat("MTask graph, final, total graph cost", report.totalGraphCost());
-    V3Stats::addStat("MTask graph, final, mtask count", report.vertexCount());
-    V3Stats::addStat("MTask graph, final, edge count", report.edgeCount());
-    V3Stats::addStat("MTask graph, final, parallelism factor", report.parallelismFactor());
-    if (debug() >= 3) {
-        UINFO(0, "\n");
-        UINFO(0, "  Final mtask parallelism report:\n");
-        UINFO(0, "  Critical path cost = " << report.criticalPathCost() << "\n");
-        UINFO(0, "  Total graph cost = " << report.totalGraphCost() << "\n");
-        UINFO(0, "  MTask vertex count = " << report.vertexCount() << "\n");
-        UINFO(0, "  Edge count = " << report.edgeCount() << "\n");
-        UINFO(0, "  Parallelism factor = " << report.parallelismFactor() << "\n");
-    }
-}
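
The priority propagation at the top of finalizeCosts is a longest-path computation in reverse topological order. A toy illustration (hypothetical `Node` type and hand-rolled visit order; the real code iterates via GraphStreamUnordered):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

struct Node {
    uint32_t cost;
    uint32_t priority;         // Starts equal to cost, as set in fillinCosts()
    std::vector<Node*> succs;  // Downstream dependents
};

int main() {
    // Toy DAG: a -> b -> d and a -> c -> d, with arbitrary example costs
    Node d{5, 5, {}};
    Node b{1, 1, {&d}};
    Node c{10, 10, {&d}};
    Node a{2, 2, {&b, &c}};
    // Visit in reverse topological order: d, c, b, a
    for (Node* np : {&d, &c, &b, &a}) {
        for (Node* fp : np->succs) {
            if (fp->priority + np->cost > np->priority) np->priority = fp->priority + np->cost;
        }
    }
    // a's priority is 17: the long pole a -> c -> d costs 2 + 10 + 5
    std::cout << a.priority << "\n";
    return 0;
}
```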
-
-static void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t threadId,
-                               AstCFunc* funcp, const ExecMTask* mtaskp) {
-    AstNodeModule* const modp = v3Global.rootp()->topModulep();
-    FileLine* const fl = modp->fileline();
-
-    // Helper function to make the code a bit more legible
-    const auto addStrStmt = [=](const string& stmt) -> void {  //
-        funcp->addStmtsp(new AstCStmt{fl, stmt});
-    };
-
-    if (const uint32_t nDependencies = schedule.crossThreadDependencies(mtaskp)) {
-        // This mtask has dependencies executed on another thread, so it may block.
-        // Create the task state variable and wait to be notified.
-        const string name = "__Vm_mtaskstate_" + cvtToStr(mtaskp->id());
-        AstBasicDType* const mtaskStateDtypep
-            = v3Global.rootp()->typeTablep()->findBasicDType(fl, VBasicDTypeKwd::MTASKSTATE);
-        AstVar* const varp = new AstVar{fl, VVarType::MODULETEMP, name, mtaskStateDtypep};
-        varp->valuep(new AstConst{fl, nDependencies});
-        varp->protect(false);  // Do not protect as we still have references in AstText
-        modp->addStmtsp(varp);
-        // For now, reference is still via text bashing
-        addStrStmt("vlSelf->" + name + ".waitUntilUpstreamDone(even_cycle);\n");
-    }
-
-    if (v3Global.opt.profPgo()) {
-        // No lock around startCounter, as counter numbers are unique per thread
-        addStrStmt("vlSymsp->_vm_pgoProfiler.startCounter(" + std::to_string(mtaskp->profilerId())
-                   + ");\n");
-    }
-
-    // Move the actual body into this function
-    funcp->addStmtsp(mtaskp->bodyp()->unlinkFrBack());
-
-    if (v3Global.opt.profPgo()) {
-        // No lock around stopCounter, as counter numbers are unique per thread
-        addStrStmt("vlSymsp->_vm_pgoProfiler.stopCounter(" + std::to_string(mtaskp->profilerId())
-                   + ");\n");
-    }
-
-    // For any dependent mtask that's on another thread, signal one dependency completion.
-    for (V3GraphEdge* edgep = mtaskp->outBeginp(); edgep; edgep = edgep->outNextp()) {
-        const ExecMTask* const nextp = edgep->top()->as<ExecMTask>();
-        if (schedule.threadId(nextp) != threadId) {
-            addStrStmt("vlSelf->__Vm_mtaskstate_" + cvtToStr(nextp->id())
-                       + ".signalUpstreamDone(even_cycle);\n");
-        }
-    }
-}
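
The waitUntilUpstreamDone/signalUpstreamDone calls emitted above implement a per-mtask countdown of cross-thread dependencies. A rough single-generation analogue with std::atomic (illustration only; the real runtime class behind VBasicDTypeKwd::MTASKSTATE also alternates an even/odd generation so counters need not be reset between evaluations):

```cpp
#include <atomic>
#include <cstdint>

// Sketch: one instance per mtask that has cross-thread dependencies
class MTaskStateSketch {
    std::atomic<uint32_t> m_remaining;  // Upstream cross-thread mtasks still running
public:
    explicit MTaskStateSketch(uint32_t nDependencies)
        : m_remaining{nDependencies} {}
    // Called by each upstream mtask that runs on a different thread
    void signalUpstreamDone() { m_remaining.fetch_sub(1, std::memory_order_release); }
    // Spin until all cross-thread upstream mtasks have signalled
    void waitUntilUpstreamDone() const {
        while (m_remaining.load(std::memory_order_acquire) != 0) {}  // Real code yields/blocks
    }
};

int main() {
    MTaskStateSketch state{2};
    state.signalUpstreamDone();
    state.signalUpstreamDone();
    state.waitUntilUpstreamDone();  // Returns immediately: both upstreams are done
    return 0;
}
```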
-
-static const std::vector<AstCFunc*> createThreadFunctions(const ThreadSchedule& schedule,
-                                                          const string& tag) {
-    AstNodeModule* const modp = v3Global.rootp()->topModulep();
-    FileLine* const fl = modp->fileline();
-
-    std::vector<AstCFunc*> funcps;
-
-    // For each thread, create a function representing its entry point
-    for (const std::vector<const ExecMTask*>& thread : schedule.threads) {
-        if (thread.empty()) continue;
-        const uint32_t threadId = schedule.threadId(thread.front());
-        const string name{"__Vthread__" + tag + "__" + cvtToStr(threadId)};
-        AstCFunc* const funcp = new AstCFunc{fl, name, nullptr, "void"};
-        modp->addStmtsp(funcp);
-        funcps.push_back(funcp);
-        funcp->isStatic(true);  // Uses void self pointer, so static and hand rolled
-        funcp->isLoose(true);
-        funcp->entryPoint(true);
-        funcp->argTypes("void* voidSelf, bool even_cycle");
-
-        // Setup vlSelf and vlSyms
-        funcp->addStmtsp(new AstCStmt{fl, EmitCBase::voidSelfAssign(modp)});
-        funcp->addStmtsp(new AstCStmt{fl, EmitCBase::symClassAssign()});
-
-        // Invoke each mtask scheduled to this thread from the thread function
-        for (const ExecMTask* const mtaskp : thread) {
-            addMTaskToFunction(schedule, threadId, funcp, mtaskp);
-        }
-
-        // Unblock the fake "final" mtask when this thread is finished
-        funcp->addStmtsp(new AstCStmt{fl, "vlSelf->__Vm_mtaskstate_final__" + tag
-                                              + ".signalUpstreamDone(even_cycle);\n"});
-    }
-
-    // Create the fake "final" mtask state variable
-    AstBasicDType* const mtaskStateDtypep
-        = v3Global.rootp()->typeTablep()->findBasicDType(fl, VBasicDTypeKwd::MTASKSTATE);
-    AstVar* const varp
-        = new AstVar{fl, VVarType::MODULETEMP, "__Vm_mtaskstate_final__" + tag, mtaskStateDtypep};
-    varp->valuep(new AstConst(fl, funcps.size()));
-    varp->protect(false);  // Do not protect as we still have references in AstText
-    modp->addStmtsp(varp);
-
-    return funcps;
-}
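
Each non-empty entry of ThreadSchedule::threads becomes one such thread function. A runnable sketch of the iteration shape (hypothetical `MTask` type and "eval" tag; the real code builds AstCFunc nodes rather than strings):

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct MTask {
    uint32_t id;
};

int main() {
    // threads[i] = sequence of mtasks thread i executes, as in ThreadSchedule::threads
    const std::vector<std::vector<MTask>> threads{{{1}, {4}}, {{2}, {3}}, {}};
    for (size_t i = 0; i < threads.size(); ++i) {
        if (threads[i].empty()) continue;  // Same skip as createThreadFunctions
        std::string body = "__Vthread__eval__" + std::to_string(i) + ":";
        for (const MTask& mt : threads[i]) body += " mtask" + std::to_string(mt.id);
        std::cout << body << "\n";  // e.g. "__Vthread__eval__0: mtask1 mtask4"
    }
    return 0;
}
```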
-
-static void addThreadStartToExecGraph(AstExecGraph* const execGraphp,
-                                      const std::vector<AstCFunc*>& funcps) {
-    // FileLine used for constructing nodes below
-    FileLine* const fl = v3Global.rootp()->fileline();
-    const string& tag = execGraphp->name();
-
-    // Add thread function invocations to execGraph
-    const auto addStrStmt = [=](const string& stmt) -> void {  //
-        execGraphp->addStmtsp(new AstCStmt{fl, stmt});
-    };
-    const auto addTextStmt = [=](const string& text) -> void {
-        execGraphp->addStmtsp(new AstText{fl, text, /* tracking: */ true});
-    };
-
-    if (v3Global.opt.profExec()) {
-        addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphBegin();\n");
-    }
-
-    addStrStmt("vlSymsp->__Vm_even_cycle__" + tag + " = !vlSymsp->__Vm_even_cycle__" + tag
-               + ";\n");
-
-    const uint32_t last = funcps.size() - 1;
-    for (uint32_t i = 0; i <= last; ++i) {
-        AstCFunc* const funcp = funcps.at(i);
-        if (i != last) {
-            // The first N-1 will run on the thread pool.
-            addTextStmt("vlSymsp->__Vm_threadPoolp->workerp(" + cvtToStr(i) + ")->addTask(");
-            execGraphp->addStmtsp(new AstAddrOfCFunc{fl, funcp});
-            addTextStmt(", vlSelf, vlSymsp->__Vm_even_cycle__" + tag + ");\n");
-        } else {
-            // The last will run on the main thread.
-            AstCCall* const callp = new AstCCall{fl, funcp};
-            callp->dtypeSetVoid();
-            callp->argTypes("vlSelf, vlSymsp->__Vm_even_cycle__" + tag);
-            execGraphp->addStmtsp(callp->makeStmt());
-            addStrStmt("Verilated::mtaskId(0);\n");
-        }
-    }
-
-    addStrStmt("vlSelf->__Vm_mtaskstate_final__" + tag
-               + ".waitUntilUpstreamDone(vlSymsp->__Vm_even_cycle__" + tag + ");\n");
-
-    if (v3Global.opt.profExec()) {
-        addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphEnd();\n");
-    }
-}
-
-static void wrapMTaskBodies(AstExecGraph* const execGraphp) {
-    FileLine* const flp = execGraphp->fileline();
-    const string& tag = execGraphp->name();
-    AstNodeModule* const modp = v3Global.rootp()->topModulep();
-
-    for (AstMTaskBody* mtaskBodyp = execGraphp->mTaskBodiesp(); mtaskBodyp;
-         mtaskBodyp = VN_AS(mtaskBodyp->nextp(), MTaskBody)) {
-        ExecMTask* const mtaskp = mtaskBodyp->execMTaskp();
-        const std::string name = tag + "_mtask" + std::to_string(mtaskp->id());
-        AstCFunc* const funcp = new AstCFunc{flp, name, nullptr};
-        funcp->isLoose(true);
-        modp->addStmtsp(funcp);
-
-        // Helper function to make the code a bit more legible
-        const auto addStrStmt = [=](const string& stmt) -> void {  //
-            funcp->addStmtsp(new AstCStmt{flp, stmt});
-        };
-
-        if (v3Global.opt.profExec()) {
-            const string& id = std::to_string(mtaskp->id());
-            const string& predictStart = std::to_string(mtaskp->predictStart());
-            addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskBegin(" + id + ", " + predictStart
-                       + ");\n");
-        }
-
-        // Set mtask ID in the run-time system
-        addStrStmt("Verilated::mtaskId(" + std::to_string(mtaskp->id()) + ");\n");
-
-        // Run body
-        funcp->addStmtsp(mtaskBodyp->stmtsp()->unlinkFrBackWithNext());
-
-        // Flush message queue
-        addStrStmt("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
-
-        if (v3Global.opt.profExec()) {
-            const string& id = std::to_string(mtaskp->id());
-            const string& predictConst = std::to_string(mtaskp->cost());
-            addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskEnd(" + id + ", " + predictConst
-                       + ");\n");
-        }
-
-        // The AstMTaskBody will simply contain a call to the wrapper function
-        AstCCall* const callp = new AstCCall{flp, funcp};
-        callp->selfPointer(VSelfPointerText{VSelfPointerText::This{}});
-        callp->dtypeSetVoid();
-        mtaskBodyp->addStmtsp(callp->makeStmt());
-    }
-}
-
-static void implementExecGraph(AstExecGraph* const execGraphp) {
-    // Nothing to be done if there are no MTasks in the graph at all.
-    if (execGraphp->depGraphp()->empty()) return;
-
-    // Schedule the mtasks: statically associate each mtask with a thread,
-    // and determine the order in which each thread will run its mtasks.
-    const ThreadSchedule& schedule = PartPackMTasks{}.pack(*execGraphp->depGraphp());
-
-    // Create a function to be run by each thread. Note this moves all AstMTaskBody nodes from
-    // the AstExecGraph into the AstCFunc created.
-    const std::vector<AstCFunc*>& funcps = createThreadFunctions(schedule, execGraphp->name());
-    UASSERT(!funcps.empty(), "Non-empty ExecGraph yields no threads?");
-
-    // Start the thread functions at the point this AstExecGraph is located in the tree.
-    addThreadStartToExecGraph(execGraphp, funcps);
-}
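
The dispatch pattern built here — queue the first N-1 thread functions, run the last inline, then wait for all — can be mocked with plain std::thread (illustration only; the real runtime reuses persistent pool workers via addTask rather than spawning threads each cycle):

```cpp
#include <iostream>
#include <thread>
#include <vector>

int main() {
    // Stand-ins for the generated __Vthread__ entry points
    std::vector<void (*)(bool)> funcps = {
        [](bool) { std::cout << "thread fn 0\n"; },
        [](bool) { std::cout << "thread fn 1\n"; },
        [](bool) { std::cout << "thread fn 2 (main thread)\n"; },
    };
    const bool evenCycle = true;
    std::vector<std::thread> workers;
    // The first N-1 run on workers, like workerp(i)->addTask(...)
    for (size_t i = 0; i + 1 < funcps.size(); ++i) workers.emplace_back(funcps[i], evenCycle);
    // The last runs inline on the caller, like the generated AstCCall
    funcps.back()(evenCycle);
    // Stand-in for waiting on the fake "final" mtask state
    for (std::thread& t : workers) t.join();
    return 0;
}
```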
-
-void V3Partition::finalize(AstNetlist* netlistp) {
-    // Called by Verilator top stage
-    netlistp->topModulep()->foreach([&](AstExecGraph* execGraphp) {
-        // Back in V3Order, we partitioned mtasks using provisional cost
-        // estimates. However, V3Order precedes some optimizations (notably
-        // V3LifePost) that can change the cost of logic within each mtask.
-        // Now that logic is final, recompute the cost and priority of each
-        // ExecMTask.
-        fillinCosts(execGraphp->depGraphp());
-        finalizeCosts(execGraphp->depGraphp());
-
-        // Wrap each MTask body into a CFunc for better profiling/debugging
-        wrapMTaskBodies(execGraphp);
-
-        // Replace the graph body with its multi-threaded implementation.
-        implementExecGraph(execGraphp);
-    });
-}
-
-void V3Partition::selfTest() {
-    UINFO(2, __FUNCTION__ << ": " << endl);
-    PartPropagateCpSelfTest::selfTest();
-    PartPackMTasks::selfTest();
-    PartContraction::selfTest();
-}
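
Note the AstNode::foreach pattern used by finalize(): it visits every node of the lambda's parameter type below the receiver. As a fragment of how the same pattern could be used elsewhere inside Verilator (illustrative only, not part of this commit):

```cpp
// Count all AstCFunc nodes under the netlist (fragment; assumes Verilator context)
uint32_t nFuncs = 0;
v3Global.rootp()->foreach([&](AstCFunc*) { ++nFuncs; });
UINFO(4, "Netlist contains " << nFuncs << " AstCFunc nodes" << endl);
```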
diff --git a/src/V3Partition.h b/src/V3Partition.h
deleted file mode 100644
index 592885951..000000000
--- a/src/V3Partition.h
+++ /dev/null
@@ -1,79 +0,0 @@
-// -*- mode: C++; c-file-style: "cc-mode" -*-
-//*************************************************************************
-// DESCRIPTION: Verilator: Threading's logic to mtask partitioner
-//
-// Code available from: https://verilator.org
-//
-//*************************************************************************
-//
-// Copyright 2003-2024 by Wilson Snyder. This program is free software; you
-// can redistribute it and/or modify it under the terms of either the GNU
-// Lesser General Public License Version 3 or the Perl Artistic License
-// Version 2.0.
-// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
-//
-//*************************************************************************
-
-#ifndef VERILATOR_V3PARTITION_H_
-#define VERILATOR_V3PARTITION_H_
-
-#include "config_build.h"
-#include "verilatedos.h"
-
-#include "V3Graph.h"
-#include "V3OrderGraph.h"
-#include "V3ThreadSafety.h"
-
-#include <list>
-#include <string>
-
-class LogicMTask;
-
-//*************************************************************************
-/// V3Partition takes the fine-grained logic graph from V3Order and
-/// collapses it into a coarse-grained graph of AbstractLogicMTask's, each
-/// of which contains a set of the logic nodes from the fine-grained
-/// graph.
-
-class V3Partition final {
-    // MEMBERS
-    const OrderGraph* const m_orderGraphp;  // The OrderGraph
-    const V3Graph* const m_fineDepsGraphp;  // Fine-grained dependency graph
-
-    LogicMTask* m_entryMTaskp = nullptr;  // Singular source vertex of the dependency graph
-    LogicMTask* m_exitMTaskp = nullptr;  // Singular sink vertex of the dependency graph
-
-public:
-    // CONSTRUCTORS
-    explicit V3Partition(const OrderGraph* orderGraphp, const V3Graph* fineDepsGraphp)
-        : m_orderGraphp{orderGraphp}
-        , m_fineDepsGraphp{fineDepsGraphp} {}
-    ~V3Partition() = default;
-
-    // METHODS
-
-    // Fill in the provided empty graph with AbstractLogicMTask's and their
-    // interdependencies.
-    void go(V3Graph* mtasksp) VL_MT_DISABLED;
-
-    static void selfTest() VL_MT_DISABLED;
-    static void selfTestNormalizeCosts() VL_MT_DISABLED;
-
-    // Print out a hash of the shape of graphp. Only needed to debug the
-    // origin of some nondeterminism; otherwise this is pretty useless.
-    static void hashGraphDebug(const V3Graph* graphp, const char* debugName) VL_MT_DISABLED;
-
-    // Print debug stats about graphp whose nodes must be AbstractMTask's.
-    static void debugMTaskGraphStats(const V3Graph* graphp, const string& stage) VL_MT_DISABLED;
-
-    // Operate on the final ExecMTask graph, immediately prior to code
-    // generation time.
-    static void finalize(AstNetlist* netlistp) VL_MT_DISABLED;
-
-private:
-    uint32_t setupMTaskDeps(V3Graph* mtasksp) VL_MT_DISABLED;
-
-    VL_UNCOPYABLE(V3Partition);
-};
-
-#endif  // Guard
diff --git a/src/V3PartitionGraph.h b/src/V3PartitionGraph.h
index 915787a63..04eed0136 100644
--- a/src/V3PartitionGraph.h
+++ b/src/V3PartitionGraph.h
@@ -25,48 +25,7 @@
 
 #include <list>
 
-// Similar to OrderMoveVertex, but modified for threaded code generation.
-class MTaskMoveVertex final : public V3GraphVertex {
-    VL_RTTI_IMPL(MTaskMoveVertex, V3GraphVertex)
-    // This could be more compact, since we know m_varp and m_logicp
-    // cannot both be set. Each MTaskMoveVertex represents a logic node
-    // or a var node, it can't be both.
-    OrderLogicVertex* const m_logicp;  // Logic represented by this vertex
-    const AstSenTree* const m_domainp;
-
-public:
-    MTaskMoveVertex(V3Graph& graph, OrderLogicVertex* logicp,
-                    const AstSenTree* domainp) VL_MT_DISABLED : V3GraphVertex{&graph},
-                                                                m_logicp{logicp},
-                                                                m_domainp{domainp} {}
-    ~MTaskMoveVertex() override = default;
-
-    // ACCESSORS
-    OrderLogicVertex* logicp() const { return m_logicp; }
-    const AstScope* scopep() const { return m_logicp ? m_logicp->scopep() : nullptr; }
-    const AstSenTree* domainp() const { return m_domainp; }
-
-    string dotColor() const override {
-        if (logicp()) {
-            return logicp()->dotColor();
-        } else {
-            return "yellow";
-        }
-    }
-    string name() const override {
-        string nm;
-        if (logicp()) {
-            nm = logicp()->name();
-            nm += (string{"\\nMV:"} + " d=" + cvtToHex(logicp()->domainp()) + " s="
-                   + cvtToHex(logicp()->scopep())
-                   // "color()" represents the mtask ID.
-                   + "\\nt=" + cvtToStr(color()));
-        } else {
-            nm = "nolog\\nt=" + cvtToStr(color());
-        }
-        return nm;
-    }
-};
+class MTaskMoveVertex;
 
 //*************************************************************************
 // MTasks and graph structures
diff --git a/src/Verilator.cpp b/src/Verilator.cpp
index c3c7cb289..029db638c 100644
--- a/src/Verilator.cpp
+++ b/src/Verilator.cpp
@@ -48,6 +48,7 @@
 #include "V3EmitMk.h"
 #include "V3EmitV.h"
 #include "V3EmitXml.h"
+#include "V3ExecGraph.h"
 #include "V3Expand.h"
 #include "V3File.h"
 #include "V3Force.h"
@@ -71,10 +72,10 @@
 #include "V3Localize.h"
 #include "V3MergeCond.h"
 #include "V3Name.h"
+#include "V3Order.h"
 #include "V3Os.h"
 #include "V3Param.h"
 #include "V3ParseSym.h"
-#include "V3Partition.h"
 #include "V3PreShell.h"
 #include "V3Premit.h"
 #include "V3ProtectLib.h"
@@ -550,11 +551,10 @@ static void process() {
     }
 
     if (!v3Global.opt.serializeOnly() && v3Global.opt.mtasks()) {
-        // Finalize our MTask cost estimates and pack the mtasks into
-        // threads. Must happen pre-EmitC which relies on the packing
-        // order. Must happen post-V3LifePost which changes the relative
-        // costs of mtasks.
-        V3Partition::finalize(v3Global.rootp());
+        // Implement the ExecGraphs by packing mtasks onto threads.
+        // This should happen as late as possible (after all optimizations),
+        // as it relies on cost estimates.
+        V3ExecGraph::implement(v3Global.rootp());
     }
 
     if (!v3Global.opt.lintOnly() && !v3Global.opt.serializeOnly()
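
The new V3ExecGraph.h itself is not shown in this diff; judging from the call sites here and the namespace in V3ExecGraph.cpp, it is presumably a small interface along these lines (a sketch, not the actual header):

```cpp
// Sketch of src/V3ExecGraph.h inferred from the call sites in this diff
#ifndef VERILATOR_V3EXECGRAPH_H_
#define VERILATOR_V3EXECGRAPH_H_

#include "config_build.h"
#include "verilatedos.h"

#include "V3ThreadSafety.h"

class AstNetlist;

namespace V3ExecGraph {
// Implement every AstExecGraph in the netlist: finalize mtask costs,
// pack mtasks onto threads, and emit the thread start/wait code.
void implement(AstNetlist* netlistp) VL_MT_DISABLED;

void selfTest() VL_MT_DISABLED;
}  // namespace V3ExecGraph

#endif  // Guard
```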
@@ -676,8 +676,8 @@ static void verilate(const string& argString) {
         V3Graph::selfTest();
         V3TSP::selfTest();
         V3ScoreboardBase::selfTest();
-        V3Partition::selfTest();
-        V3Partition::selfTestNormalizeCosts();
+        V3Order::selfTestParallel();
+        V3ExecGraph::selfTest();
         V3PreShell::selfTest();
         V3Broken::selfTest();
     }
diff --git a/test_regress/t/t_dotfiles.pl b/test_regress/t/t_dotfiles.pl
index 9189fda39..e0c05a270 100755
--- a/test_regress/t/t_dotfiles.pl
+++ b/test_regress/t/t_dotfiles.pl
@@ -21,7 +21,7 @@ compile(
 foreach my $dotname ("linkcells", "task_call", "gate_graph", "gate_final", "acyc_simp",
                      "orderg_pre", "orderg_acyc", "orderg_order", "orderg_domain",
                      "ordermv_initial", "ordermv_hazards", "ordermv_contraction",
-                     "ordermv_transitive1", "orderg_done", "ordermv_transitive2", "schedule") {
+                     "ordermv_transitive1", "orderg_done", "schedule") {
     # Some files with identical prefix are generated multiple times during
     # Verilation. Ensure that at least one of each $dotname-prefixed file is generated.
     @dotFiles = glob("$Self->{obj_dir}/*$dotname.dot");