verilator/src/V3Sched.cpp

1134 lines
51 KiB
C++
Raw Normal View History

// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
// DESCRIPTION: Verilator: Code scheduling
//
// Code available from: https://verilator.org
//
//*************************************************************************
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of either the GNU Lesser General Public License Version 3
// or the Perl Artistic License Version 2.0.
// SPDX-FileCopyrightText: 2003-2026 Wilson Snyder
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//*************************************************************************
//
// V3Sched::schedule is the top level entry-point to the scheduling algorithm.
// At a high level, the process is:
//
// - Gather and classify all logic in the design based on what triggers its execution
// - Schedule static, initial and final logic classes in source order
// - Break combinational cycles by introducing hybrid logic
// - Create 'settle' region that restores the combinational invariant
// - Partition the clocked and combinational (including hybrid) logic into pre/act/nba.
// All clocks (signals referenced in an AstSenTree) generated via a blocking assignment
// (including combinationally generated signals) are computed within the act region.
// - Replicate combinational logic
// - Create input combinational logic loop
// - Create the pre/act/nba triggers
// - Create the 'act' region evaluation function
// - Create the 'nba' region evaluation function
// - Bolt it all together to create the '_eval' function
//
// Details of the algorithm are described in the internals documentation docs/internals.rst
//
//*************************************************************************
#include "V3PchAstNoMT.h" // VL_MT_DISABLED_CODE_UNIT
2022-08-05 13:15:59 +02:00
#include "V3Sched.h"
#include "V3Const.h"
#include "V3EmitCBase.h"
#include "V3EmitV.h"
#include "V3Order.h"
#include "V3SenExprBuilder.h"
#include "V3Stats.h"
2022-09-22 18:28:42 +02:00
VL_DEFINE_DEBUG_FUNCTIONS;
namespace V3Sched {
namespace {
//============================================================================
// Utility functions
// Gather the sensitivity trees referenced by the AstActive entries of all the given
// LogicByScope collections, keeping only those that report hasClocked() or hasHybrid().
// A tree for which user1SetOnce() returns true is skipped (presumably: it was already
// visited during this call), so trees are returned in order of first encounter.
std::vector<const AstSenTree*> getSenTreesUsedBy(const std::vector<const LogicByScope*>& lbsps) {
    // AstSenTree::user1 is used as the "seen" marker for the duration of this function
    const VNUser1InUse user1InUse;
    std::vector<const AstSenTree*> collected;
    for (const LogicByScope* const logicp : lbsps) {
        for (const auto& entry : *logicp) {
            AstSenTree* const sensp = entry.second->sentreep();
            // The marker is tested-and-set for every tree, whether or not it is collected
            const bool seenBefore = sensp->user1SetOnce();
            if (!seenBefore && (sensp->hasClocked() || sensp->hasHybrid())) {
                collected.push_back(sensp);
            }
        }
    }
    return collected;
}
void remapSensitivities(const LogicByScope& lbs,
const std::unordered_map<const AstSenTree*, AstSenTree*>& senTreeMap) {
for (const auto& pair : lbs) {
AstActive* const activep = pair.second;
AstSenTree* const senTreep = activep->sentreep();
if (senTreep->hasCombo()) continue;
activep->sentreep(senTreeMap.at(senTreep));
}
}
// Add the inverse of 'senTreeMap' to 'result': for each (key, value) entry of the input,
// a (value, key) entry is emplaced into 'result'.
void invertAndMergeSenTreeMap(
    V3Order::TrigToSenMap& result,
    const std::unordered_map<const AstSenTree*, AstSenTree*>& senTreeMap) {
    for (auto it = senTreeMap.cbegin(); it != senTreeMap.cend(); ++it) {
        result.emplace(it->second, it->first);
    }
}
// Return the sensitivity trees of all entries in 'vifMemberTriggered' whose interface
// matches the interface associated with 'vscp'. Asserts that such an interface can be
// determined, and that at least one matching entry exists.
std::vector<AstSenTree*>
findTriggeredIface(const AstVarScope* vscp,
                   const VirtIfaceTriggers::IfaceMemberSensMap& vifMemberTriggered) {
    const AstVar* const varp = vscp->varp();
    const AstIface* ifacep = nullptr;
    if (!varp->isVirtIface()) {
        // A variable of a non-virtual interface type has 'sensIfacep()' set to the
        // interface it is sensitive to
        ifacep = varp->sensIfacep();
    } else {
        // For a virtual interface variable, the interface type it points to is found
        // under `VN_AS(varp->dtypep(), IfaceRefDType)->ifacep()`
        ifacep = VN_AS(varp->dtypep(), IfaceRefDType)->ifacep();
        // A virtual interface that is also sensitive to some interface has an ambiguous
        // type. This may be valid behaviour, but this function does not expect it.
        UASSERT_OBJ(varp->sensIfacep() == nullptr, vscp,
                    "Virtual interface has an ambiguous type - "
                        << varp->sensIfacep()->prettyTypeName()
                        << " != " << ifacep->prettyTypeName());
    }
    UASSERT_OBJ(ifacep, vscp, "Variable is not sensitive for any interface");

    // Pick out the triggers registered for members of that interface
    std::vector<AstSenTree*> matches;
    for (const auto& member : vifMemberTriggered) {
        if (member.first.m_ifacep != ifacep) continue;
        matches.push_back(member.second);
    }
    UASSERT_OBJ(!matches.empty(), vscp, "Did not find virtual interface trigger");
    return matches;
}
//============================================================================
// Eval loop builder
// Result of 'createEvalLoop'. Initialized positionally, so member order matters.
struct EvalLoop final {
    // Flag set to true on entry to the first iteration of the loop.
    // This is nullptr when 'createEvalLoop' had neither triggers nor a condition.
    AstVarScope* firstIterp;
    // The loop itself and statements around it. When 'createEvalLoop' had neither
    // triggers nor a condition, this is just the inner loop passed in (possibly nullptr).
    AstNodeStmt* stmtsp;
};
// Create an eval loop with all the trimmings.
EvalLoop createEvalLoop(
AstNetlist* netlistp, //
const std::string& tag, // Tag for current phase
const string& name, // Name of current phase
bool slow, // Should create slow functions
const TriggerKit& trigKit, // The trigger kit
Support #0 delays with IEEE-1800 compliant semantics (#7079) This patch adds IEEE-1800 compliant scheduling support for the Inactive scheduling region used for #0 delays. Implementing this requires that **all** IEEE-1800 active region events are placed in the internal 'act' section. This has simulation performance implications. It prevents some optimizations (e.g. V3LifePost), which reduces single threaded performance. It also reduces the available work and parallelism in the internal 'nba' section, which reduced the effectiveness of multi-threading severely. Performance impact on RTLMeter when using scheduling adjusted to support proper #0 delays is ~10-20% slowdown in single-threaded mode, and ~100% (2x slower) with --threads 4. To avoid paying this performance penalty unconditionally, the scheduling is only adjusted if either: 1. The input contains a statically known #0 delay 2. The input contains a variable #x delay unknown at compile time If no #0 is present, but #x variable delays are, a ZERODLY warning is issued advising the use of '--no-sched-zero-delay' which is a promise by the user that none of the variable delays will evaluate to a zero delay at run-time. This warning is turned off if '--sched-zero-delay' is explicitly given. This is similar to the '--timing' option. If '--no-sched-zero-delay' was used at compile time, then executing a zero delay will fail at runtime. A ZERODLY warning is also issued if a static #0 if found, but the user specified '--no-sched-zero-delay'. In this case the scheduling is not adjusted to support #0, so executing it will fail at runtime. Presumably the user knows it won't be executed. 
The intended behaviour with all this is the following: No #0, no #var in the design (#constant is OK) -> Same as current behaviour, scheduling not adjusted, same code generated as before Has static #0 and '--no-sched-zero-delay' is NOT given: -> No warnings, scheduling adjusted so it just works, runs slow Has static #0 and '--no-sched-zero-delay' is given: -> ZERODLY on the #0, scheduling not adjusted, fails at runtime if hit No static #0, but has #var and no option is given: -> ZERODLY on the #var advising use of '--no-sched-zero-delay' or '--sched-zero-delay' (similar to '--timing'), scheduling adjusted assuming it can be a zero delay and it just works No static #0, but has #var and '--no-sched-zero-delay' is given: -> No warning, scheduling not adjusted, fails at runtime if zero delay No static #0, but has #var and '--sched-zero-delay' is given: -> No warning, scheduling adjusted so it just works
2026-02-16 04:55:55 +01:00
AstVarScope* trigp, // The trigger vector - may be nullptr if no triggers or using 'condp'
AstNodeExpr* condp, // Explicit condition that must be true to run 'phaseWorkp'
AstNodeStmt* innerp, // The inner loop, if any
AstNodeStmt* phasePrepp, // Prep statements run before checking triggers
AstNodeStmt* phaseWorkp, // The work to do if anything triggered
// Extra statements to run after the work, even if no triggers fired. This function is
// passed a variable, which must be set to true if we must continue and loop again,
// and must be unmodified otherwise.
std::function<AstNodeStmt*(AstVarScope*)> phaseExtra = [](AstVarScope*) { return nullptr; } //
) {
Support #0 delays with IEEE-1800 compliant semantics (#7079) This patch adds IEEE-1800 compliant scheduling support for the Inactive scheduling region used for #0 delays. Implementing this requires that **all** IEEE-1800 active region events are placed in the internal 'act' section. This has simulation performance implications. It prevents some optimizations (e.g. V3LifePost), which reduces single threaded performance. It also reduces the available work and parallelism in the internal 'nba' section, which reduced the effectiveness of multi-threading severely. Performance impact on RTLMeter when using scheduling adjusted to support proper #0 delays is ~10-20% slowdown in single-threaded mode, and ~100% (2x slower) with --threads 4. To avoid paying this performance penalty unconditionally, the scheduling is only adjusted if either: 1. The input contains a statically known #0 delay 2. The input contains a variable #x delay unknown at compile time If no #0 is present, but #x variable delays are, a ZERODLY warning is issued advising the use of '--no-sched-zero-delay' which is a promise by the user that none of the variable delays will evaluate to a zero delay at run-time. This warning is turned off if '--sched-zero-delay' is explicitly given. This is similar to the '--timing' option. If '--no-sched-zero-delay' was used at compile time, then executing a zero delay will fail at runtime. A ZERODLY warning is also issued if a static #0 if found, but the user specified '--no-sched-zero-delay'. In this case the scheduling is not adjusted to support #0, so executing it will fail at runtime. Presumably the user knows it won't be executed. 
The intended behaviour with all this is the following: No #0, no #var in the design (#constant is OK) -> Same as current behaviour, scheduling not adjusted, same code generated as before Has static #0 and '--no-sched-zero-delay' is NOT given: -> No warnings, scheduling adjusted so it just works, runs slow Has static #0 and '--no-sched-zero-delay' is given: -> ZERODLY on the #0, scheduling not adjusted, fails at runtime if hit No static #0, but has #var and no option is given: -> ZERODLY on the #var advising use of '--no-sched-zero-delay' or '--sched-zero-delay' (similar to '--timing'), scheduling adjusted assuming it can be a zero delay and it just works No static #0, but has #var and '--no-sched-zero-delay' is given: -> No warning, scheduling not adjusted, fails at runtime if zero delay No static #0, but has #var and '--sched-zero-delay' is given: -> No warning, scheduling adjusted so it just works
2026-02-16 04:55:55 +01:00
UASSERT(!trigp || !condp, "Cannot use both 'trigp' and 'condp' in 'createEvalLoop'");
// All work is under a trigger or condition, so if there are none,
// there is nothing to do besides executing the inner loop.
if (!trigp && !condp) return {nullptr, innerp};
const std::string varPrefix = "__V" + tag;
AstScope* const scopeTopp = netlistp->topScopep()->scopep();
FileLine* const flp = netlistp->fileline();
// We wrap the prep/cond/work in a function for readability
AstCFunc* const phaseFuncp = util::makeTopFunction(netlistp, "_eval_phase__" + tag, slow);
{
// Add the preparatory statements
phaseFuncp->addStmtsp(phasePrepp);
// The execute flag
AstVarScope* const executeFlagp = scopeTopp->createTemp(varPrefix + "Execute", 1);
executeFlagp->varp()->noReset(true);
// If there is work in this phase, execute it if any triggers fired
if (phaseWorkp) {
AstNodeExpr* const lhsp = new AstVarRef{flp, executeFlagp, VAccess::WRITE};
Support #0 delays with IEEE-1800 compliant semantics (#7079) This patch adds IEEE-1800 compliant scheduling support for the Inactive scheduling region used for #0 delays. Implementing this requires that **all** IEEE-1800 active region events are placed in the internal 'act' section. This has simulation performance implications. It prevents some optimizations (e.g. V3LifePost), which reduces single threaded performance. It also reduces the available work and parallelism in the internal 'nba' section, which reduced the effectiveness of multi-threading severely. Performance impact on RTLMeter when using scheduling adjusted to support proper #0 delays is ~10-20% slowdown in single-threaded mode, and ~100% (2x slower) with --threads 4. To avoid paying this performance penalty unconditionally, the scheduling is only adjusted if either: 1. The input contains a statically known #0 delay 2. The input contains a variable #x delay unknown at compile time If no #0 is present, but #x variable delays are, a ZERODLY warning is issued advising the use of '--no-sched-zero-delay' which is a promise by the user that none of the variable delays will evaluate to a zero delay at run-time. This warning is turned off if '--sched-zero-delay' is explicitly given. This is similar to the '--timing' option. If '--no-sched-zero-delay' was used at compile time, then executing a zero delay will fail at runtime. A ZERODLY warning is also issued if a static #0 if found, but the user specified '--no-sched-zero-delay'. In this case the scheduling is not adjusted to support #0, so executing it will fail at runtime. Presumably the user knows it won't be executed. 
The intended behaviour with all this is the following: No #0, no #var in the design (#constant is OK) -> Same as current behaviour, scheduling not adjusted, same code generated as before Has static #0 and '--no-sched-zero-delay' is NOT given: -> No warnings, scheduling adjusted so it just works, runs slow Has static #0 and '--no-sched-zero-delay' is given: -> ZERODLY on the #0, scheduling not adjusted, fails at runtime if hit No static #0, but has #var and no option is given: -> ZERODLY on the #var advising use of '--no-sched-zero-delay' or '--sched-zero-delay' (similar to '--timing'), scheduling adjusted assuming it can be a zero delay and it just works No static #0, but has #var and '--no-sched-zero-delay' is given: -> No warning, scheduling not adjusted, fails at runtime if zero delay No static #0, but has #var and '--sched-zero-delay' is given: -> No warning, scheduling adjusted so it just works
2026-02-16 04:55:55 +01:00
// If using explicit condition, that directly determines whether to execute,
// otherwise check if any triggers are fired
AstNodeExpr* const rhsp = condp ? condp : trigKit.newAnySetCall(trigp);
phaseFuncp->addStmtsp(new AstAssign{flp, lhsp, rhsp});
// Add the work
AstIf* const ifp = new AstIf{flp, new AstVarRef{flp, executeFlagp, VAccess::READ}};
ifp->addThensp(phaseWorkp);
phaseFuncp->addStmtsp(ifp);
}
// Construct the extra statements
AstNodeStmt* const extraWorkp = phaseExtra(executeFlagp);
if (extraWorkp) phaseFuncp->addStmtsp(extraWorkp);
// The function returns ture iff it did run work
phaseFuncp->rtnType("bool");
AstNodeExpr* const retp
= phaseWorkp || extraWorkp
? static_cast<AstNodeExpr*>(new AstVarRef{flp, executeFlagp, VAccess::READ})
: static_cast<AstNodeExpr*>(new AstConst{flp, AstConst::BitFalse{}});
phaseFuncp->addStmtsp(new AstCReturn{flp, retp});
}
// The result statements
AstNodeStmt* stmtps = nullptr;
// Prof-exec section push
if (v3Global.opt.profExec()) { //
stmtps = AstCStmt::profExecSectionPush(flp, "loop " + tag);
}
2026-01-23 18:53:40 +01:00
const auto addVar = [&](const std::string& name, int width, uint32_t initVal, bool init) {
const string tempName{"__V" + tag + name};
AstVarScope* const vscp = tempName == "__VstlFirstIteration"
? netlistp->stlFirstIterationp()
: scopeTopp->createTemp(tempName, width);
vscp->varp()->noReset(true);
vscp->varp()->isInternal(true);
2026-01-23 18:53:40 +01:00
if (init) stmtps = AstNode::addNext(stmtps, util::setVar(vscp, initVal));
return vscp;
};
// The iteration counter
2026-01-23 18:53:40 +01:00
AstVarScope* const counterp = addVar("IterCount", 32, 0, true);
// The first iteration flag - cleared in 'phasePrepp' if used
2026-01-23 18:53:40 +01:00
AstVarScope* const firstIterFlagp = addVar("FirstIteration", 1, 1, true);
// Phase function result
AstVarScope* const phaseResultp = addVar("PhaseResult", 1, 0, false);
// The loop
{
AstLoop* const loopp = new AstLoop{flp};
stmtps->addNext(loopp);
Support #0 delays with IEEE-1800 compliant semantics (#7079) This patch adds IEEE-1800 compliant scheduling support for the Inactive scheduling region used for #0 delays. Implementing this requires that **all** IEEE-1800 active region events are placed in the internal 'act' section. This has simulation performance implications. It prevents some optimizations (e.g. V3LifePost), which reduces single threaded performance. It also reduces the available work and parallelism in the internal 'nba' section, which reduced the effectiveness of multi-threading severely. Performance impact on RTLMeter when using scheduling adjusted to support proper #0 delays is ~10-20% slowdown in single-threaded mode, and ~100% (2x slower) with --threads 4. To avoid paying this performance penalty unconditionally, the scheduling is only adjusted if either: 1. The input contains a statically known #0 delay 2. The input contains a variable #x delay unknown at compile time If no #0 is present, but #x variable delays are, a ZERODLY warning is issued advising the use of '--no-sched-zero-delay' which is a promise by the user that none of the variable delays will evaluate to a zero delay at run-time. This warning is turned off if '--sched-zero-delay' is explicitly given. This is similar to the '--timing' option. If '--no-sched-zero-delay' was used at compile time, then executing a zero delay will fail at runtime. A ZERODLY warning is also issued if a static #0 if found, but the user specified '--no-sched-zero-delay'. In this case the scheduling is not adjusted to support #0, so executing it will fail at runtime. Presumably the user knows it won't be executed. 
The intended behaviour with all this is the following: No #0, no #var in the design (#constant is OK) -> Same as current behaviour, scheduling not adjusted, same code generated as before Has static #0 and '--no-sched-zero-delay' is NOT given: -> No warnings, scheduling adjusted so it just works, runs slow Has static #0 and '--no-sched-zero-delay' is given: -> ZERODLY on the #0, scheduling not adjusted, fails at runtime if hit No static #0, but has #var and no option is given: -> ZERODLY on the #var advising use of '--no-sched-zero-delay' or '--sched-zero-delay' (similar to '--timing'), scheduling adjusted assuming it can be a zero delay and it just works No static #0, but has #var and '--no-sched-zero-delay' is given: -> No warning, scheduling not adjusted, fails at runtime if zero delay No static #0, but has #var and '--sched-zero-delay' is given: -> No warning, scheduling adjusted so it just works
2026-02-16 04:55:55 +01:00
// Check the iteration limit (aborts if exceeded). Dump triggers if using triggers.
AstNodeStmt* dumpCallp = trigp ? trigKit.newDumpCall(trigp, tag, false) : nullptr;
loopp->addStmtsp(util::checkIterationLimit(netlistp, name, counterp, dumpCallp));
// Increment the iteration counter
loopp->addStmtsp(util::incrementVar(counterp));
// Execute the inner loop
loopp->addStmtsp(innerp);
// Call the phase function to execute the current work. If we did
// work, then need to loop again, so set the continuation flag.
// If used, the first iteration flag is cleared when consumed, no
// need to reset it
AstCCall* const callp = new AstCCall{flp, phaseFuncp};
callp->dtypeSetBit();
2026-01-23 18:53:40 +01:00
AstAssign* const resultAssignp
= new AstAssign{flp, new AstVarRef{flp, phaseResultp, VAccess::WRITE}, callp};
loopp->addStmtsp(resultAssignp);
// Clear FirstIteration flag
AstAssign* const firstClearp
= new AstAssign{flp, new AstVarRef{flp, firstIterFlagp, VAccess::WRITE},
new AstConst{flp, AstConst::BitFalse()}};
loopp->addStmtsp(firstClearp);
// Continues until the continuation flag is clear
2026-01-23 18:53:40 +01:00
loopp->addStmtsp(
new AstLoopTest{flp, loopp, new AstVarRef{flp, phaseResultp, VAccess::READ}});
}
// Prof-exec section pop
if (v3Global.opt.profExec()) {
stmtps->addNext(AstCStmt::profExecSectionPop(flp, "loop " + tag));
}
return {firstIterFlagp, stmtps};
}
//============================================================================
// Collect and classify all logic in the design
// Walk every AstActive under every AstScope of the netlist and sort it, together with its
// scope, into one of the LogicClasses lists based on its sensitivity tree:
//  - hasStatic()  -> m_static
//  - hasInitial() -> m_initial
//  - hasFinal()   -> m_final
//  - hasCombo()   -> m_postponed if the first statement is an AlwaysPostponed,
//                    otherwise m_comb
//  - otherwise (must be hasClocked()) -> m_observed / m_reactive if the first statement is
//                    an AlwaysObserved / AlwaysReactive, otherwise m_clocked
// All but the clocked classes assert that the sensitivity tree has a single sensitivity.
LogicClasses gatherLogicClasses(AstNetlist* netlistp) {
    LogicClasses result;

    netlistp->foreach([&](AstScope* scopep) {
        scopep->foreach([&](AstActive* activep) {
            AstSenTree* const senTreep = activep->sentreep();
            if (senTreep->hasStatic()) {
                UASSERT_OBJ(!senTreep->sensesp()->nextp(), activep,
                            "static initializer with additional sensitivities");
                result.m_static.emplace_back(scopep, activep);
            } else if (senTreep->hasInitial()) {
                UASSERT_OBJ(!senTreep->sensesp()->nextp(), activep,
                            "'initial' logic with additional sensitivities");
                result.m_initial.emplace_back(scopep, activep);
            } else if (senTreep->hasFinal()) {
                UASSERT_OBJ(!senTreep->sensesp()->nextp(), activep,
                            "'final' logic with additional sensitivities");
                result.m_final.emplace_back(scopep, activep);
            } else if (senTreep->hasCombo()) {
                UASSERT_OBJ(!senTreep->sensesp()->nextp(), activep,
                            "combinational logic with additional sensitivities");
                // Only the first statement under the active is inspected
                if (VN_IS(activep->stmtsp(), AlwaysPostponed)) {
                    result.m_postponed.emplace_back(scopep, activep);
                } else {
                    result.m_comb.emplace_back(scopep, activep);
                }
            } else {
                UASSERT_OBJ(senTreep->hasClocked(), activep, "What else could it be?");
                // Only the first statement under the active is inspected
                if (VN_IS(activep->stmtsp(), AlwaysObserved)) {
                    result.m_observed.emplace_back(scopep, activep);
                } else if (VN_IS(activep->stmtsp(), AlwaysReactive)) {
                    result.m_reactive.emplace_back(scopep, activep);
                } else {
                    result.m_clocked.emplace_back(scopep, activep);
                }
            }
        });
    });

    return result;
}
//============================================================================
// Simple ordering in source order
void orderSequentially(AstCFunc* funcp, const LogicByScope& lbs) {
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
// Create new subfunc for scope
const auto createNewSubFuncp = [&](AstScope* const scopep) {
const string subName{funcp->name() + "__" + scopep->nameDotless()};
AstCFunc* const subFuncp = new AstCFunc{scopep->fileline(), subName, scopep};
subFuncp->isLoose(true);
subFuncp->isConst(false);
subFuncp->declPrivate(true);
subFuncp->slow(funcp->slow());
scopep->addBlocksp(subFuncp);
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
// Call it from the top function
funcp->addStmtsp(util::callVoidFunc(subFuncp));
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
return subFuncp;
};
const VNUser1InUse user1InUse; // AstScope -> AstCFunc: the sub-function for the scope
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
const VNUser2InUse user2InUse; // AstScope -> int: sub-function counter used for names
for (const auto& pair : lbs) {
AstScope* const scopep = pair.first;
AstActive* const activep = pair.second;
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
// Create a sub-function per scope so we can V3Combine them later
if (!scopep->user1p()) scopep->user1p(createNewSubFuncp(scopep));
// Add statements to sub-function
for (AstNode *logicp = activep->stmtsp(), *nextp; logicp; logicp = nextp) {
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
auto* subFuncp = VN_AS(scopep->user1p(), CFunc);
nextp = logicp->nextp();
if (AstNodeProcedure* const procp = VN_CAST(logicp, NodeProcedure)) {
if (AstNode* bodyp = procp->stmtsp()) {
bodyp->unlinkFrBackWithNext();
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
// If the process is suspendable, we need a separate function (a coroutine)
if (procp->isSuspendable()) {
funcp->slow(false);
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
subFuncp = createNewSubFuncp(scopep);
2023-11-21 03:02:56 +01:00
subFuncp->name(subFuncp->name() + "__Vtiming__"
+ cvtToStr(scopep->user2Inc()));
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
subFuncp->rtnType("VlCoroutine");
if (VN_IS(procp, Always)) {
subFuncp->slow(false);
FileLine* const flp = procp->fileline();
AstNodeExpr* const condp = new AstCExpr{
flp, "VL_LIKELY(!vlSymsp->_vm_contextp__->gotFinish())", 1};
AstLoop* const loopp = new AstLoop{flp};
loopp->addStmtsp(new AstLoopTest{flp, loopp, condp});
loopp->addStmtsp(bodyp);
bodyp = loopp;
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
}
}
subFuncp->addStmtsp(bodyp);
if (procp->needProcess()) subFuncp->setNeedProcess();
util::splitCheck(subFuncp);
}
} else {
logicp->unlinkFrBack();
subFuncp->addStmtsp(logicp);
}
}
if (activep->backp()) activep->unlinkFrBack();
VL_DO_DANGLING(activep->deleteTree(), activep);
}
}
//============================================================================
// Create simply ordered functions
// Builds the slow top level '_eval_static' function and fills it with the
// static logic class via orderSequentially(). Returns the new function.
AstCFunc* createStatic(AstNetlist* netlistp, const LogicClasses& logicClasses) {
    AstCFunc* const staticFuncp
        = util::makeTopFunction(netlistp, "_eval_static", /* slow: */ true);
    orderSequentially(staticFuncp, logicClasses.m_static);
    // Unlike the sibling creators, no util::splitCheck() here: the function is not final yet
    return staticFuncp;
}
// Builds the slow top level '_eval_initial' function, fills it with the
// initial logic class via orderSequentially(), then runs the split check on it.
void createInitial(AstNetlist* netlistp, const LogicClasses& logicClasses) {
    AstCFunc* const initialFuncp
        = util::makeTopFunction(netlistp, "_eval_initial", /* slow: */ true);
    orderSequentially(initialFuncp, logicClasses.m_initial);
    util::splitCheck(initialFuncp);
}
// Builds the slow top level '_eval_postponed' function from the postponed
// logic class. Returns nullptr (and creates nothing) when that class is empty,
// otherwise returns the new function after running the split check on it.
AstCFunc* createPostponed(AstNetlist* netlistp, const LogicClasses& logicClasses) {
    const auto& postponedLogic = logicClasses.m_postponed;
    // Without postponed logic there is no function to create
    if (postponedLogic.empty()) return nullptr;
    AstCFunc* const postponedFuncp
        = util::makeTopFunction(netlistp, "_eval_postponed", /* slow: */ true);
    orderSequentially(postponedFuncp, postponedLogic);
    util::splitCheck(postponedFuncp);
    return postponedFuncp;
}
// Builds the slow top level '_eval_final' function, fills it with the
// final logic class via orderSequentially(), then runs the split check on it.
void createFinal(AstNetlist* netlistp, const LogicClasses& logicClasses) {
    AstCFunc* const finalFuncp
        = util::makeTopFunction(netlistp, "_eval_final", /* slow: */ true);
    orderSequentially(finalFuncp, logicClasses.m_final);
    util::splitCheck(finalFuncp);
}
//============================================================================
// Helper that adds one extra trigger assignment to 'trigKit' for each entry of
// 'virtIfaceTriggers.m_memberTriggers'. The first entry is assigned to extra
// trigger index 'vifMemberTriggerIndex', each following entry to the next index.
// Note: 'vifTriggerIndex' is accepted but not read by this function.
void addVirtIfaceTriggerAssignments(const VirtIfaceTriggers& virtIfaceTriggers,
                                    uint32_t vifTriggerIndex, uint32_t vifMemberTriggerIndex,
                                    const TriggerKit& trigKit) {
    uint32_t memberIndex = vifMemberTriggerIndex;
    for (const auto& memberTrigger : virtIfaceTriggers.m_memberTriggers) {
        // Use the current index for this entry, then advance to the next one
        trigKit.addExtraTriggerAssignment(memberTrigger.second, memberIndex++);
    }
}
// Order the combinational logic to create the settle loop
void createSettle(AstNetlist* netlistp, AstCFunc* const initFuncp, SenExprBuilder& senExprBulider,
LogicClasses& logicClasses) {
AstCFunc* const funcp = util::makeTopFunction(netlistp, "_eval_settle", true);
// Clone, because ordering is destructive, but we still need them for "_eval"
LogicByScope comb = logicClasses.m_comb.clone();
LogicByScope hybrid = logicClasses.m_hybrid.clone();
// Nothing to do if there is no logic.
// While this is rare in real designs, it reduces noise in small tests.
if (comb.empty() && hybrid.empty()) return;
// We have an extra trigger denoting this is the first iteration of the settle loop
TriggerKit::ExtraTriggers extraTriggers;
const uint32_t firstIterationTrigger = extraTriggers.allocate("first iteration");
// Gather the relevant sensitivity expressions and create the trigger kit
const auto& senTreeps = getSenTreesUsedBy({&comb, &hybrid});
const TriggerKit trigKit = TriggerKit::create(netlistp, initFuncp, senExprBulider, {},
2026-02-11 19:35:59 +01:00
senTreeps, "stl", extraTriggers, true, false);
// Remap sensitivities (comb has none, so only do the hybrid)
remapSensitivities(hybrid, trigKit.mapVec());
// Create the inverse map from trigger ref AstSenTree to original AstSenTree
V3Order::TrigToSenMap trigToSen;
invertAndMergeSenTreeMap(trigToSen, trigKit.mapVec());
// First trigger is for pure combinational triggers (first iteration)
AstSenTree* const inputChanged
= trigKit.newExtraTriggerSenTree(trigKit.vscp(), firstIterationTrigger);
// Create and the body function
AstCFunc* const stlFuncp = V3Order::order(
netlistp, {&comb, &hybrid}, trigToSen, "stl", false, true,
[=](const AstVarScope*, std::vector<AstSenTree*>& out) { out.push_back(inputChanged); });
util::splitCheck(stlFuncp);
// Create the eval loop
const EvalLoop stlLoop = createEvalLoop( //
Support #0 delays with IEEE-1800 compliant semantics (#7079) This patch adds IEEE-1800 compliant scheduling support for the Inactive scheduling region used for #0 delays. Implementing this requires that **all** IEEE-1800 active region events are placed in the internal 'act' section. This has simulation performance implications. It prevents some optimizations (e.g. V3LifePost), which reduces single threaded performance. It also reduces the available work and parallelism in the internal 'nba' section, which reduced the effectiveness of multi-threading severely. Performance impact on RTLMeter when using scheduling adjusted to support proper #0 delays is ~10-20% slowdown in single-threaded mode, and ~100% (2x slower) with --threads 4. To avoid paying this performance penalty unconditionally, the scheduling is only adjusted if either: 1. The input contains a statically known #0 delay 2. The input contains a variable #x delay unknown at compile time If no #0 is present, but #x variable delays are, a ZERODLY warning is issued advising the use of '--no-sched-zero-delay' which is a promise by the user that none of the variable delays will evaluate to a zero delay at run-time. This warning is turned off if '--sched-zero-delay' is explicitly given. This is similar to the '--timing' option. If '--no-sched-zero-delay' was used at compile time, then executing a zero delay will fail at runtime. A ZERODLY warning is also issued if a static #0 if found, but the user specified '--no-sched-zero-delay'. In this case the scheduling is not adjusted to support #0, so executing it will fail at runtime. Presumably the user knows it won't be executed. 
The intended behaviour with all this is the following: No #0, no #var in the design (#constant is OK) -> Same as current behaviour, scheduling not adjusted, same code generated as before Has static #0 and '--no-sched-zero-delay' is NOT given: -> No warnings, scheduling adjusted so it just works, runs slow Has static #0 and '--no-sched-zero-delay' is given: -> ZERODLY on the #0, scheduling not adjusted, fails at runtime if hit No static #0, but has #var and no option is given: -> ZERODLY on the #var advising use of '--no-sched-zero-delay' or '--sched-zero-delay' (similar to '--timing'), scheduling adjusted assuming it can be a zero delay and it just works No static #0, but has #var and '--no-sched-zero-delay' is given: -> No warning, scheduling not adjusted, fails at runtime if zero delay No static #0, but has #var and '--sched-zero-delay' is given: -> No warning, scheduling adjusted so it just works
2026-02-16 04:55:55 +01:00
netlistp, "stl", "Settle", /* slow: */ true, trigKit,
// Use trigger
trigKit.vscp(), nullptr,
// Explicit condition
// Inner loop statements
nullptr,
// Prep statements: Compute the current 'stl' triggers
2026-02-11 19:35:59 +01:00
[&trigKit] {
AstNodeStmt* const stmtp = trigKit.newCompBaseCall();
if (stmtp) stmtp->addNext(trigKit.newDumpCall(trigKit.vscp(), trigKit.name(), true));
return stmtp;
}(),
// Work statements: Invoke the 'stl' function
util::callVoidFunc(stlFuncp));
// Add the first iteration trigger to the trigger computation function
2026-01-23 18:53:40 +01:00
trigKit.addExtraTriggerAssignment(stlLoop.firstIterp, firstIterationTrigger, false);
// Add the eval loop to the top function
funcp->addStmtsp(stlLoop.stmtsp);
}
//============================================================================
// Order the replicated combinational logic to create the 'ico' region
AstNode* createInputCombLoop(AstNetlist* netlistp, AstCFunc* const initFuncp,
SenExprBuilder& senExprBuilder, LogicByScope& logic,
const VirtIfaceTriggers& virtIfaceTriggers) {
// Nothing to do if no combinational logic is sensitive to top level inputs
if (logic.empty()) return nullptr;
// SystemC only: Any top level inputs feeding a combinational logic must be marked,
// so we can make them sc_sensitive
if (v3Global.opt.systemC()) {
logic.foreachLogic([](AstNode* logicp) {
logicp->foreach([](AstVarRef* refp) {
if (refp->access().isWriteOnly()) return;
AstVarScope* const vscp = refp->varScopep();
if (vscp->scopep()->isTop() && vscp->varp()->isNonOutput()) {
vscp->varp()->scSensitive(true);
}
});
});
}
// We have some extra trigger denoting external conditions
AstVarScope* const dpiExportTriggerVscp = netlistp->dpiExportTriggerp();
TriggerKit::ExtraTriggers extraTriggers;
const uint32_t firstIterationTrigger = extraTriggers.allocate("first iteration");
const uint32_t dpiExportTriggerIndex = dpiExportTriggerVscp
? extraTriggers.allocate("DPI export trigger")
: std::numeric_limits<uint32_t>::max();
const size_t firstVifTriggerIndex = extraTriggers.size();
const size_t firstVifMemberTriggerIndex = extraTriggers.size();
for (const auto& p : virtIfaceTriggers.m_memberTriggers) {
const auto& item = p.first;
extraTriggers.allocate("virtual interface member: " + item.m_ifacep->name() + "."
+ item.m_memberp->name());
}
// Gather the relevant sensitivity expressions and create the trigger kit
const auto& senTreeps = getSenTreesUsedBy({&logic});
const TriggerKit trigKit = TriggerKit::create(netlistp, initFuncp, senExprBuilder, {},
2026-02-11 19:35:59 +01:00
senTreeps, "ico", extraTriggers, false, false);
std::ignore = senExprBuilder.getAndClearResults();
if (dpiExportTriggerVscp) {
trigKit.addExtraTriggerAssignment(dpiExportTriggerVscp, dpiExportTriggerIndex);
}
addVirtIfaceTriggerAssignments(virtIfaceTriggers, firstVifTriggerIndex,
firstVifMemberTriggerIndex, trigKit);
// Remap sensitivities
remapSensitivities(logic, trigKit.mapVec());
// Create the inverse map from trigger ref AstSenTree to original AstSenTree
V3Order::TrigToSenMap trigToSen;
invertAndMergeSenTreeMap(trigToSen, trigKit.mapVec());
// The trigger top level inputs (first iteration)
AstSenTree* const inputChanged
= trigKit.newExtraTriggerSenTree(trigKit.vscp(), firstIterationTrigger);
// The DPI Export trigger
AstSenTree* const dpiExportTriggered
= dpiExportTriggerVscp
? trigKit.newExtraTriggerSenTree(trigKit.vscp(), dpiExportTriggerIndex)
: nullptr;
const auto& vifMemberTriggeredIco = virtIfaceTriggers.makeMemberToSensMap(
trigKit, firstVifMemberTriggerIndex, trigKit.vscp());
// Create and Order the body function
AstCFunc* const icoFuncp = V3Order::order(
netlistp, {&logic}, trigToSen, "ico", false, false,
[=](const AstVarScope* vscp, std::vector<AstSenTree*>& out) {
AstVar* const varp = vscp->varp();
if (varp->isPrimaryInish() || varp->isSigUserRWPublic()) {
out.push_back(inputChanged);
}
if (varp->isWrittenByDpi()) out.push_back(dpiExportTriggered);
if (vscp->varp()->isVirtIface()) {
std::vector<AstSenTree*> ifaceTriggered
= findTriggeredIface(vscp, vifMemberTriggeredIco);
out.insert(out.end(), ifaceTriggered.begin(), ifaceTriggered.end());
}
});
util::splitCheck(icoFuncp);
// Create the eval loop
const EvalLoop icoLoop = createEvalLoop( //
Support #0 delays with IEEE-1800 compliant semantics (#7079) This patch adds IEEE-1800 compliant scheduling support for the Inactive scheduling region used for #0 delays. Implementing this requires that **all** IEEE-1800 active region events are placed in the internal 'act' section. This has simulation performance implications. It prevents some optimizations (e.g. V3LifePost), which reduces single threaded performance. It also reduces the available work and parallelism in the internal 'nba' section, which reduced the effectiveness of multi-threading severely. Performance impact on RTLMeter when using scheduling adjusted to support proper #0 delays is ~10-20% slowdown in single-threaded mode, and ~100% (2x slower) with --threads 4. To avoid paying this performance penalty unconditionally, the scheduling is only adjusted if either: 1. The input contains a statically known #0 delay 2. The input contains a variable #x delay unknown at compile time If no #0 is present, but #x variable delays are, a ZERODLY warning is issued advising the use of '--no-sched-zero-delay' which is a promise by the user that none of the variable delays will evaluate to a zero delay at run-time. This warning is turned off if '--sched-zero-delay' is explicitly given. This is similar to the '--timing' option. If '--no-sched-zero-delay' was used at compile time, then executing a zero delay will fail at runtime. A ZERODLY warning is also issued if a static #0 if found, but the user specified '--no-sched-zero-delay'. In this case the scheduling is not adjusted to support #0, so executing it will fail at runtime. Presumably the user knows it won't be executed. 
The intended behaviour with all this is the following: No #0, no #var in the design (#constant is OK) -> Same as current behaviour, scheduling not adjusted, same code generated as before Has static #0 and '--no-sched-zero-delay' is NOT given: -> No warnings, scheduling adjusted so it just works, runs slow Has static #0 and '--no-sched-zero-delay' is given: -> ZERODLY on the #0, scheduling not adjusted, fails at runtime if hit No static #0, but has #var and no option is given: -> ZERODLY on the #var advising use of '--no-sched-zero-delay' or '--sched-zero-delay' (similar to '--timing'), scheduling adjusted assuming it can be a zero delay and it just works No static #0, but has #var and '--no-sched-zero-delay' is given: -> No warning, scheduling not adjusted, fails at runtime if zero delay No static #0, but has #var and '--sched-zero-delay' is given: -> No warning, scheduling adjusted so it just works
2026-02-16 04:55:55 +01:00
netlistp, "ico", "Input combinational", /* slow: */ false, trigKit,
// Use trigger
trigKit.vscp(), nullptr,
// Inner loop statements
nullptr,
// Prep statements: Compute the current 'ico' triggers
2026-02-11 19:35:59 +01:00
[&trigKit] {
AstNodeStmt* const stmtp = trigKit.newCompBaseCall();
if (stmtp) stmtp->addNext(trigKit.newDumpCall(trigKit.vscp(), trigKit.name(), true));
return stmtp;
}(),
// Work statements: Invoke the 'ico' function
util::callVoidFunc(icoFuncp));
// Add the first iteration trigger to the trigger computation function
2026-01-23 18:53:40 +01:00
trigKit.addExtraTriggerAssignment(icoLoop.firstIterp, firstIterationTrigger, false);
return icoLoop.stmtsp;
}
//============================================================================
// EvalKit groups items that have to be passed to createEval() for a given eval region
2022-12-23 13:34:49 +01:00
struct EvalKit final {
// The AstVarScope representing the region's trigger vector
AstVarScope* const m_vscp = nullptr;
// The AstCFunc that evaluates the region's logic
AstCFunc* const m_funcp = nullptr;
// Is this kit used/required?
bool empty() const { return !m_funcp; }
};
2022-12-23 13:34:49 +01:00
//============================================================================
// Bolt together parts to create the top level _eval function
void createEval(AstNetlist* netlistp, //
AstNode* icoLoop, //
const TriggerKit& trigKit, //
2022-12-23 13:34:49 +01:00
const EvalKit& actKit, //
const EvalKit& nbaKit, //
const EvalKit& obsKit, //
const EvalKit& reactKit, //
AstCFunc* postponedFuncp, //
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
TimingKit& timingKit //
) {
FileLine* const flp = netlistp->fileline();
Support #0 delays with IEEE-1800 compliant semantics (#7079) This patch adds IEEE-1800 compliant scheduling support for the Inactive scheduling region used for #0 delays. Implementing this requires that **all** IEEE-1800 active region events are placed in the internal 'act' section. This has simulation performance implications. It prevents some optimizations (e.g. V3LifePost), which reduces single threaded performance. It also reduces the available work and parallelism in the internal 'nba' section, which reduced the effectiveness of multi-threading severely. Performance impact on RTLMeter when using scheduling adjusted to support proper #0 delays is ~10-20% slowdown in single-threaded mode, and ~100% (2x slower) with --threads 4. To avoid paying this performance penalty unconditionally, the scheduling is only adjusted if either: 1. The input contains a statically known #0 delay 2. The input contains a variable #x delay unknown at compile time If no #0 is present, but #x variable delays are, a ZERODLY warning is issued advising the use of '--no-sched-zero-delay' which is a promise by the user that none of the variable delays will evaluate to a zero delay at run-time. This warning is turned off if '--sched-zero-delay' is explicitly given. This is similar to the '--timing' option. If '--no-sched-zero-delay' was used at compile time, then executing a zero delay will fail at runtime. A ZERODLY warning is also issued if a static #0 if found, but the user specified '--no-sched-zero-delay'. In this case the scheduling is not adjusted to support #0, so executing it will fail at runtime. Presumably the user knows it won't be executed. 
The intended behaviour with all this is the following: No #0, no #var in the design (#constant is OK) -> Same as current behaviour, scheduling not adjusted, same code generated as before Has static #0 and '--no-sched-zero-delay' is NOT given: -> No warnings, scheduling adjusted so it just works, runs slow Has static #0 and '--no-sched-zero-delay' is given: -> ZERODLY on the #0, scheduling not adjusted, fails at runtime if hit No static #0, but has #var and no option is given: -> ZERODLY on the #var advising use of '--no-sched-zero-delay' or '--sched-zero-delay' (similar to '--timing'), scheduling adjusted assuming it can be a zero delay and it just works No static #0, but has #var and '--no-sched-zero-delay' is given: -> No warning, scheduling not adjusted, fails at runtime if zero delay No static #0, but has #var and '--sched-zero-delay' is given: -> No warning, scheduling adjusted so it just works
2026-02-16 04:55:55 +01:00
// Grab the delay scheduler variable, if any
AstVarScope* const delaySchedVscp = timingKit.getDelayScheduler(netlistp);
2026-02-11 19:35:59 +01:00
// 'createResume' consumes the contents that 'createReady' needs, so do the right order
AstCCall* const timingReadyp = timingKit.createReady(netlistp);
AstCCall* const timingResumep = timingKit.createResume(netlistp);
// Create the active eval loop
EvalLoop topLoop = createEvalLoop( //
Support #0 delays with IEEE-1800 compliant semantics (#7079) This patch adds IEEE-1800 compliant scheduling support for the Inactive scheduling region used for #0 delays. Implementing this requires that **all** IEEE-1800 active region events are placed in the internal 'act' section. This has simulation performance implications. It prevents some optimizations (e.g. V3LifePost), which reduces single threaded performance. It also reduces the available work and parallelism in the internal 'nba' section, which reduced the effectiveness of multi-threading severely. Performance impact on RTLMeter when using scheduling adjusted to support proper #0 delays is ~10-20% slowdown in single-threaded mode, and ~100% (2x slower) with --threads 4. To avoid paying this performance penalty unconditionally, the scheduling is only adjusted if either: 1. The input contains a statically known #0 delay 2. The input contains a variable #x delay unknown at compile time If no #0 is present, but #x variable delays are, a ZERODLY warning is issued advising the use of '--no-sched-zero-delay' which is a promise by the user that none of the variable delays will evaluate to a zero delay at run-time. This warning is turned off if '--sched-zero-delay' is explicitly given. This is similar to the '--timing' option. If '--no-sched-zero-delay' was used at compile time, then executing a zero delay will fail at runtime. A ZERODLY warning is also issued if a static #0 if found, but the user specified '--no-sched-zero-delay'. In this case the scheduling is not adjusted to support #0, so executing it will fail at runtime. Presumably the user knows it won't be executed. 
The intended behaviour with all this is the following: No #0, no #var in the design (#constant is OK) -> Same as current behaviour, scheduling not adjusted, same code generated as before Has static #0 and '--no-sched-zero-delay' is NOT given: -> No warnings, scheduling adjusted so it just works, runs slow Has static #0 and '--no-sched-zero-delay' is given: -> ZERODLY on the #0, scheduling not adjusted, fails at runtime if hit No static #0, but has #var and no option is given: -> ZERODLY on the #var advising use of '--no-sched-zero-delay' or '--sched-zero-delay' (similar to '--timing'), scheduling adjusted assuming it can be a zero delay and it just works No static #0, but has #var and '--no-sched-zero-delay' is given: -> No warning, scheduling not adjusted, fails at runtime if zero delay No static #0, but has #var and '--sched-zero-delay' is given: -> No warning, scheduling adjusted so it just works
2026-02-16 04:55:55 +01:00
netlistp, "act", "Active", /* slow: */ false, trigKit,
// Use trigger
actKit.m_vscp, nullptr,
// Inner loop statements
nullptr,
// Prep statements
[&]() {
// Compute the current 'act' triggers - the NBA triggers are the latched value
2026-02-11 19:35:59 +01:00
AstNodeStmt* stmtsp = trigKit.newCompBaseCall();
AstNodeStmt* const dumpp
= stmtsp ? trigKit.newDumpCall(trigKit.vscp(), trigKit.name(), true) : nullptr;
// Mark as ready for triggered awaits
if (timingReadyp) stmtsp = AstNode::addNext(stmtsp, timingReadyp->makeStmt());
if (AstVarScope* const vscAccp = trigKit.vscAccp()) {
stmtsp = AstNode::addNext(stmtsp, trigKit.newOrIntoCall(actKit.m_vscp, vscAccp));
}
stmtsp = AstNode::addNext(stmtsp, trigKit.newCompExtCall(nbaKit.m_vscp));
stmtsp = AstNode::addNext(stmtsp, dumpp);
// Latch the 'act' triggers under the 'nba' triggers
stmtsp = AstNode::addNext(stmtsp, trigKit.newOrIntoCall(nbaKit.m_vscp, actKit.m_vscp));
//
return stmtsp;
}(),
// Work statements
[&]() {
AstNodeStmt* workp = nullptr;
2026-02-11 19:35:59 +01:00
if (AstVarScope* const actAccp = trigKit.vscAccp()) {
AstCMethodHard* const cCallp = new AstCMethodHard{
flp, new AstVarRef{flp, actAccp, VAccess::WRITE}, VCMethod::UNPACKED_FILL,
new AstConst{flp, AstConst::Unsized64{}, 0}};
cCallp->dtypeSetVoid();
workp = AstNode::addNext(workp, cCallp->makeStmt());
}
// Resume triggered timing schedulers
2026-02-11 19:35:59 +01:00
if (timingResumep) workp = AstNode::addNext(workp, timingResumep->makeStmt());
// Invoke the 'act' function
workp = AstNode::addNext(workp, util::callVoidFunc(actKit.m_funcp));
//
return workp;
}());
Support #0 delays with IEEE-1800 compliant semantics (#7079) This patch adds IEEE-1800 compliant scheduling support for the Inactive scheduling region used for #0 delays. Implementing this requires that **all** IEEE-1800 active region events are placed in the internal 'act' section. This has simulation performance implications. It prevents some optimizations (e.g. V3LifePost), which reduces single threaded performance. It also reduces the available work and parallelism in the internal 'nba' section, which reduced the effectiveness of multi-threading severely. Performance impact on RTLMeter when using scheduling adjusted to support proper #0 delays is ~10-20% slowdown in single-threaded mode, and ~100% (2x slower) with --threads 4. To avoid paying this performance penalty unconditionally, the scheduling is only adjusted if either: 1. The input contains a statically known #0 delay 2. The input contains a variable #x delay unknown at compile time If no #0 is present, but #x variable delays are, a ZERODLY warning is issued advising the use of '--no-sched-zero-delay' which is a promise by the user that none of the variable delays will evaluate to a zero delay at run-time. This warning is turned off if '--sched-zero-delay' is explicitly given. This is similar to the '--timing' option. If '--no-sched-zero-delay' was used at compile time, then executing a zero delay will fail at runtime. A ZERODLY warning is also issued if a static #0 if found, but the user specified '--no-sched-zero-delay'. In this case the scheduling is not adjusted to support #0, so executing it will fail at runtime. Presumably the user knows it won't be executed. 
The intended behaviour with all this is the following: No #0, no #var in the design (#constant is OK) -> Same as current behaviour, scheduling not adjusted, same code generated as before Has static #0 and '--no-sched-zero-delay' is NOT given: -> No warnings, scheduling adjusted so it just works, runs slow Has static #0 and '--no-sched-zero-delay' is given: -> ZERODLY on the #0, scheduling not adjusted, fails at runtime if hit No static #0, but has #var and no option is given: -> ZERODLY on the #var advising use of '--no-sched-zero-delay' or '--sched-zero-delay' (similar to '--timing'), scheduling adjusted assuming it can be a zero delay and it just works No static #0, but has #var and '--no-sched-zero-delay' is given: -> No warning, scheduling not adjusted, fails at runtime if zero delay No static #0, but has #var and '--sched-zero-delay' is given: -> No warning, scheduling adjusted so it just works
2026-02-16 04:55:55 +01:00
// Create if there are any delays, so we can check at runtime if a #0 is unexpected
if (delaySchedVscp) {
topLoop = createEvalLoop( //
netlistp, "inact", "Inactive", /* slow: */ false, trigKit,
// Use explicit condition
nullptr,
[&]() {
// Run if any zero delays are pending
AstNodeExpr* const callp
= new AstCMethodHard{flp, new AstVarRef{flp, delaySchedVscp, VAccess::READ},
VCMethod::SCHED_AWAITING_ZERO_DELAY};
callp->dtypeSetBit();
return callp;
}(),
// Inner loop statements
topLoop.stmtsp,
// Prep statements
nullptr,
// Work statements
[&]() -> AstNodeStmt* {
if (v3Global.usesZeroDelay()) {
// Resume processes waiting for #0 delay
AstCMethodHard* const callp = new AstCMethodHard{
flp, new AstVarRef{flp, delaySchedVscp, VAccess::READWRITE},
VCMethod::SCHED_RESUME_ZERO_DELAY};
callp->dtypeSetVoid();
return callp->makeStmt();
} else {
// Assumption was that the design doesn't use #0 delays.
// Die at run-time if it does.
AstCStmt* const stmtp = new AstCStmt{flp};
const FileLine* const locp = netlistp->topModulep()->fileline();
const std::string& file = VIdProtect::protect(locp->filename());
const std::string& line = std::to_string(locp->lineno());
stmtp->add(
"VL_FATAL_MT(\"" + V3OutFormatter::quoteNameControls(file) + "\", " + line
+ ", \"\", \"ZERODLY: Design Verilated with '--no-sched-zero-delay', "
+ "but #0 delay executed at runtime\");");
return stmtp;
}
}());
}
// Create the NBA eval loop, which is the default top level loop.
topLoop = createEvalLoop( //
Support #0 delays with IEEE-1800 compliant semantics (#7079) This patch adds IEEE-1800 compliant scheduling support for the Inactive scheduling region used for #0 delays. Implementing this requires that **all** IEEE-1800 active region events are placed in the internal 'act' section. This has simulation performance implications. It prevents some optimizations (e.g. V3LifePost), which reduces single threaded performance. It also reduces the available work and parallelism in the internal 'nba' section, which reduced the effectiveness of multi-threading severely. Performance impact on RTLMeter when using scheduling adjusted to support proper #0 delays is ~10-20% slowdown in single-threaded mode, and ~100% (2x slower) with --threads 4. To avoid paying this performance penalty unconditionally, the scheduling is only adjusted if either: 1. The input contains a statically known #0 delay 2. The input contains a variable #x delay unknown at compile time If no #0 is present, but #x variable delays are, a ZERODLY warning is issued advising the use of '--no-sched-zero-delay' which is a promise by the user that none of the variable delays will evaluate to a zero delay at run-time. This warning is turned off if '--sched-zero-delay' is explicitly given. This is similar to the '--timing' option. If '--no-sched-zero-delay' was used at compile time, then executing a zero delay will fail at runtime. A ZERODLY warning is also issued if a static #0 if found, but the user specified '--no-sched-zero-delay'. In this case the scheduling is not adjusted to support #0, so executing it will fail at runtime. Presumably the user knows it won't be executed. 
The intended behaviour with all this is the following: No #0, no #var in the design (#constant is OK) -> Same as current behaviour, scheduling not adjusted, same code generated as before Has static #0 and '--no-sched-zero-delay' is NOT given: -> No warnings, scheduling adjusted so it just works, runs slow Has static #0 and '--no-sched-zero-delay' is given: -> ZERODLY on the #0, scheduling not adjusted, fails at runtime if hit No static #0, but has #var and no option is given: -> ZERODLY on the #var advising use of '--no-sched-zero-delay' or '--sched-zero-delay' (similar to '--timing'), scheduling adjusted assuming it can be a zero delay and it just works No static #0, but has #var and '--no-sched-zero-delay' is given: -> No warning, scheduling not adjusted, fails at runtime if zero delay No static #0, but has #var and '--sched-zero-delay' is given: -> No warning, scheduling adjusted so it just works
2026-02-16 04:55:55 +01:00
netlistp, "nba", "NBA", /* slow: */ false, trigKit,
// Use trigger
nbaKit.m_vscp, nullptr,
// Inner loop statements
topLoop.stmtsp,
// Prep statements
nullptr,
// Work statements
[&]() {
AstNodeStmt* workp = nullptr;
// Latch the 'nba' trigger flags under the following region's trigger flags
if (!obsKit.empty()) {
workp = trigKit.newOrIntoCall(obsKit.m_vscp, nbaKit.m_vscp);
} else if (!reactKit.empty()) {
workp = trigKit.newOrIntoCall(reactKit.m_vscp, nbaKit.m_vscp);
}
// Invoke the 'nba' function
workp = AstNode::addNext(workp, util::callVoidFunc(nbaKit.m_funcp));
// Clear the 'nba' triggers
workp = AstNode::addNext(workp, trigKit.newClearCall(nbaKit.m_vscp));
//
return workp;
}(),
// Extra work (not conditional on having had a fired trigger)
[&](AstVarScope* continuep) -> AstNodeStmt* {
// Check if any dynamic NBAs are pending, if there are any in the design
if (!netlistp->nbaEventp()) return nullptr;
AstVarScope* const nbaEventp = netlistp->nbaEventp();
AstVarScope* const nbaEventTriggerp = netlistp->nbaEventTriggerp();
UASSERT(nbaEventTriggerp, "NBA event trigger var should exist");
netlistp->nbaEventp(nullptr);
netlistp->nbaEventTriggerp(nullptr);
2026-02-11 19:35:59 +01:00
// If a dynamic NBA is pending, clear the pending flag and fire the ready event
AstIf* const ifp = new AstIf{flp, new AstVarRef{flp, nbaEventTriggerp, VAccess::READ}};
ifp->addThensp(util::setVar(continuep, 1));
ifp->addThensp(util::setVar(nbaEventTriggerp, 0));
AstCMethodHard* const firep = new AstCMethodHard{
flp, new AstVarRef{flp, nbaEventp, VAccess::WRITE}, VCMethod::EVENT_FIRE};
firep->dtypeSetVoid();
ifp->addThensp(firep->makeStmt());
return ifp;
});
if (!obsKit.empty()) {
// Create the Observed eval loop, which becomes the top level loop.
topLoop = createEvalLoop( //
Support #0 delays with IEEE-1800 compliant semantics (#7079) This patch adds IEEE-1800 compliant scheduling support for the Inactive scheduling region used for #0 delays. Implementing this requires that **all** IEEE-1800 active region events are placed in the internal 'act' section. This has simulation performance implications. It prevents some optimizations (e.g. V3LifePost), which reduces single threaded performance. It also reduces the available work and parallelism in the internal 'nba' section, which reduced the effectiveness of multi-threading severely. Performance impact on RTLMeter when using scheduling adjusted to support proper #0 delays is ~10-20% slowdown in single-threaded mode, and ~100% (2x slower) with --threads 4. To avoid paying this performance penalty unconditionally, the scheduling is only adjusted if either: 1. The input contains a statically known #0 delay 2. The input contains a variable #x delay unknown at compile time If no #0 is present, but #x variable delays are, a ZERODLY warning is issued advising the use of '--no-sched-zero-delay' which is a promise by the user that none of the variable delays will evaluate to a zero delay at run-time. This warning is turned off if '--sched-zero-delay' is explicitly given. This is similar to the '--timing' option. If '--no-sched-zero-delay' was used at compile time, then executing a zero delay will fail at runtime. A ZERODLY warning is also issued if a static #0 if found, but the user specified '--no-sched-zero-delay'. In this case the scheduling is not adjusted to support #0, so executing it will fail at runtime. Presumably the user knows it won't be executed. 
The intended behaviour with all this is the following: No #0, no #var in the design (#constant is OK) -> Same as current behaviour, scheduling not adjusted, same code generated as before Has static #0 and '--no-sched-zero-delay' is NOT given: -> No warnings, scheduling adjusted so it just works, runs slow Has static #0 and '--no-sched-zero-delay' is given: -> ZERODLY on the #0, scheduling not adjusted, fails at runtime if hit No static #0, but has #var and no option is given: -> ZERODLY on the #var advising use of '--no-sched-zero-delay' or '--sched-zero-delay' (similar to '--timing'), scheduling adjusted assuming it can be a zero delay and it just works No static #0, but has #var and '--no-sched-zero-delay' is given: -> No warning, scheduling not adjusted, fails at runtime if zero delay No static #0, but has #var and '--sched-zero-delay' is given: -> No warning, scheduling adjusted so it just works
2026-02-16 04:55:55 +01:00
netlistp, "obs", "Observed", /* slow: */ false, trigKit,
// Use trigger
obsKit.m_vscp, nullptr,
// Inner loop statements
topLoop.stmtsp,
// Prep statements
nullptr,
// Work statements
[&]() {
AstNodeStmt* workp = nullptr;
// Latch the Observed trigger flags under the Reactive trigger flags
if (!reactKit.empty()) {
workp = trigKit.newOrIntoCall(reactKit.m_vscp, obsKit.m_vscp);
}
// Invoke the 'obs' function
workp = AstNode::addNext(workp, util::callVoidFunc(obsKit.m_funcp));
// Clear the 'obs' triggers
workp = AstNode::addNext(workp, trigKit.newClearCall(obsKit.m_vscp));
//
return workp;
}());
2022-12-23 13:34:49 +01:00
}
if (!reactKit.empty()) {
// Create the Reactive eval loop, which becomes the top level loop.
topLoop = createEvalLoop( //
Support #0 delays with IEEE-1800 compliant semantics (#7079) This patch adds IEEE-1800 compliant scheduling support for the Inactive scheduling region used for #0 delays. Implementing this requires that **all** IEEE-1800 active region events are placed in the internal 'act' section. This has simulation performance implications. It prevents some optimizations (e.g. V3LifePost), which reduces single threaded performance. It also reduces the available work and parallelism in the internal 'nba' section, which reduced the effectiveness of multi-threading severely. Performance impact on RTLMeter when using scheduling adjusted to support proper #0 delays is ~10-20% slowdown in single-threaded mode, and ~100% (2x slower) with --threads 4. To avoid paying this performance penalty unconditionally, the scheduling is only adjusted if either: 1. The input contains a statically known #0 delay 2. The input contains a variable #x delay unknown at compile time If no #0 is present, but #x variable delays are, a ZERODLY warning is issued advising the use of '--no-sched-zero-delay' which is a promise by the user that none of the variable delays will evaluate to a zero delay at run-time. This warning is turned off if '--sched-zero-delay' is explicitly given. This is similar to the '--timing' option. If '--no-sched-zero-delay' was used at compile time, then executing a zero delay will fail at runtime. A ZERODLY warning is also issued if a static #0 if found, but the user specified '--no-sched-zero-delay'. In this case the scheduling is not adjusted to support #0, so executing it will fail at runtime. Presumably the user knows it won't be executed. 
The intended behaviour with all this is the following: No #0, no #var in the design (#constant is OK) -> Same as current behaviour, scheduling not adjusted, same code generated as before Has static #0 and '--no-sched-zero-delay' is NOT given: -> No warnings, scheduling adjusted so it just works, runs slow Has static #0 and '--no-sched-zero-delay' is given: -> ZERODLY on the #0, scheduling not adjusted, fails at runtime if hit No static #0, but has #var and no option is given: -> ZERODLY on the #var advising use of '--no-sched-zero-delay' or '--sched-zero-delay' (similar to '--timing'), scheduling adjusted assuming it can be a zero delay and it just works No static #0, but has #var and '--no-sched-zero-delay' is given: -> No warning, scheduling not adjusted, fails at runtime if zero delay No static #0, but has #var and '--sched-zero-delay' is given: -> No warning, scheduling adjusted so it just works
2026-02-16 04:55:55 +01:00
netlistp, "react", "Reactive", /* slow: */ false, trigKit,
// Use trigger
reactKit.m_vscp, nullptr,
// Inner loop statements
topLoop.stmtsp,
// Prep statements
nullptr,
// Work statements
[&]() {
// Invoke the 'react' function
AstNodeStmt* workp = util::callVoidFunc(reactKit.m_funcp);
// Clear the 'react' triggers
workp = AstNode::addNext(workp, trigKit.newClearCall(reactKit.m_vscp));
return workp;
}());
2022-12-23 13:34:49 +01:00
}
// Now that we have built the loops, create the main 'eval' function
AstCFunc* const funcp = util::makeTopFunction(netlistp, "_eval", false);
netlistp->evalp(funcp);
if (v3Global.opt.profExec()) funcp->addStmtsp(AstCStmt::profExecSectionPush(flp, "eval"));
// Start with the ico loop, if any
if (icoLoop) funcp->addStmtsp(icoLoop);
// Execute the top level eval loop
funcp->addStmtsp(topLoop.stmtsp);
// Add the Postponed eval call
if (postponedFuncp) funcp->addStmtsp(util::callVoidFunc(postponedFuncp));
if (v3Global.opt.profExec()) funcp->addStmtsp(AstCStmt::profExecSectionPop(flp, "eval"));
}
} // namespace
//============================================================================
// Helper that builds virtual interface trigger sentrees.
// For every entry of 'm_memberTriggers', in iteration order, a sentree is created via
// 'trigKit.newExtraTriggerSenTree' on 'trigVscp'. The entries use consecutive extra trigger
// indices, the first one being 'vifTriggerIndex'. The returned map is keyed by the 'first'
// element of each 'm_memberTriggers' entry.
VirtIfaceTriggers::IfaceMemberSensMap
VirtIfaceTriggers::makeMemberToSensMap(const TriggerKit& trigKit, uint32_t vifTriggerIndex,
                                       AstVarScope* trigVscp) const {
    IfaceMemberSensMap result;
    uint32_t nextIndex = vifTriggerIndex;
    for (const auto& memberTrigger : m_memberTriggers) {
        // Create the sentree for this member, then advance to the next extra trigger index
        auto* const senTreep = trigKit.newExtraTriggerSenTree(trigVscp, nextIndex++);
        result.emplace(memberTrigger.first, senTreep);
    }
    return result;
}
// Returns a copy of 'map' with the same keys, where each mapped AstSenTree is replaced by a
// clone in which every AstVarRef (all are asserted to be READ references) has been swapped
// for a READ reference to 'vscp'. Each clone is also added to the top scope's sentrees.
std::unordered_map<const AstSenTree*, AstSenTree*>
cloneMapWithNewTriggerReferences(const std::unordered_map<const AstSenTree*, AstSenTree*>& map,
                                 AstVarScope* vscp) {
    // Start from a copy of the input; only the mapped values are rewritten below
    std::unordered_map<const AstSenTree*, AstSenTree*> result{map};
    AstTopScope* const topScopep = v3Global.rootp()->topScopep();
    for (auto& entry : result) {
        // Work on a clone, so the sentree held by the input map is left as is
        AstSenTree* const clonep = entry.second->cloneTree(false);
        // Point every reference in the clone at the given vscp
        clonep->foreach([&](AstVarRef* refp) {
            UASSERT_OBJ(refp->access() == VAccess::READ, refp, "Should be read ref");
            refp->replaceWith(new AstVarRef{refp->fileline(), vscp, VAccess::READ});
            VL_DO_DANGLING(refp->deleteTree(), refp);
        });
        // Register the clone with the top scope, and store it in the result
        topScopep->addSenTreesp(clonep);
        entry.second = clonep;
    }
    return result;
}
//============================================================================
// Top level entry-point to scheduling
void schedule(AstNetlist* netlistp) {
const auto addSizeStat = [](const string& name, const LogicByScope& lbs) {
uint64_t size = 0;
lbs.foreachLogic([&](AstNode* nodep) { size += nodep->nodeCount(); });
V3Stats::addStat("Scheduling, " + name, size);
};
// Step 0. Prepare external domains for timing and virtual interfaces
// Create extra triggers for virtual interfaces
const auto& virtIfaceTriggers = makeVirtIfaceTriggers(netlistp);
// Prepare timing-related logic and external domains
TimingKit timingKit = prepareTiming(netlistp);
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
// Step 1. Gather and classify all logic in the design
LogicClasses logicClasses = gatherLogicClasses(netlistp);
if (v3Global.opt.stats()) {
V3Stats::statsStage("sched-gather");
addSizeStat("size of class: static", logicClasses.m_static);
addSizeStat("size of class: initial", logicClasses.m_initial);
addSizeStat("size of class: final", logicClasses.m_final);
}
// Step 2. Schedule static, initial and final logic classes in source order
AstCFunc* const staticp = createStatic(netlistp, logicClasses);
if (v3Global.opt.stats()) V3Stats::statsStage("sched-static");
createInitial(netlistp, logicClasses);
if (v3Global.opt.stats()) V3Stats::statsStage("sched-initial");
createFinal(netlistp, logicClasses);
if (v3Global.opt.stats()) V3Stats::statsStage("sched-final");
// Step 3: Break combinational cycles by introducing hybrid logic
// Note: breakCycles also removes corresponding logic from logicClasses.m_comb;
logicClasses.m_hybrid = breakCycles(netlistp, logicClasses.m_comb);
if (v3Global.opt.stats()) {
addSizeStat("size of class: clocked", logicClasses.m_clocked);
addSizeStat("size of class: combinational", logicClasses.m_comb);
addSizeStat("size of class: hybrid", logicClasses.m_hybrid);
V3Stats::statsStage("sched-break-cycles");
}
// We pass around a single SenExprBuilder instance, as we only need one set of 'prev' variables
// for edge/change detection in sensitivity expressions, which this keeps track of.
AstTopScope* const topScopep = netlistp->topScopep();
AstScope* const scopeTopp = topScopep->scopep();
SenExprBuilder senExprBuilder{scopeTopp};
// Step 4: Create 'settle' region that restores the combinational invariant
createSettle(netlistp, staticp, senExprBuilder, logicClasses);
if (v3Global.opt.stats()) V3Stats::statsStage("sched-settle");
// Step 5: Partition the clocked and combinational (including hybrid) logic into pre/act/nba.
// All clocks (signals referenced in an AstSenTree) generated via a blocking assignment
// (including combinationally generated signals) are computed within the act region.
LogicRegions logicRegions
= partition(logicClasses.m_clocked, logicClasses.m_comb, logicClasses.m_hybrid);
logicRegions.m_obs = logicClasses.m_observed;
logicRegions.m_react = logicClasses.m_reactive;
if (v3Global.opt.stats()) {
addSizeStat("size of region: Active Pre", logicRegions.m_pre);
addSizeStat("size of region: Active", logicRegions.m_act);
addSizeStat("size of region: NBA", logicRegions.m_nba);
addSizeStat("size of region: Observed", logicRegions.m_obs);
addSizeStat("size of region: Reactive", logicRegions.m_react);
V3Stats::statsStage("sched-partition");
}
// Step 6: Replicate combinational logic
LogicReplicas logicReplicas = replicateLogic(logicRegions);
if (v3Global.opt.stats()) {
addSizeStat("size of replicated logic: Input", logicReplicas.m_ico);
addSizeStat("size of replicated logic: Active", logicReplicas.m_act);
addSizeStat("size of replicated logic: NBA", logicReplicas.m_nba);
addSizeStat("size of replicated logic: Observed", logicReplicas.m_obs);
addSizeStat("size of replicated logic: Reactive", logicReplicas.m_react);
V3Stats::statsStage("sched-replicate");
}
// Step 7: Create input combinational logic loop
AstNode* const icoLoopp = createInputCombLoop(netlistp, staticp, senExprBuilder,
logicReplicas.m_ico, virtIfaceTriggers);
if (v3Global.opt.stats()) V3Stats::statsStage("sched-create-ico");
// Step 8: Create the triggers
AstVarScope* const dpiExportTriggerVscp = netlistp->dpiExportTriggerp();
netlistp->dpiExportTriggerp(nullptr); // Finished with this here
// We may have an extra trigger for variable updated in DPI exports
TriggerKit::ExtraTriggers extraTriggers;
const uint32_t dpiExportTriggerIndex = dpiExportTriggerVscp
? extraTriggers.allocate("DPI export trigger")
: std::numeric_limits<uint32_t>::max();
const uint32_t firstVifTriggerIndex = extraTriggers.size();
const uint32_t firstVifMemberTriggerIndex = extraTriggers.size();
for (const auto& p : virtIfaceTriggers.m_memberTriggers) {
const auto& item = p.first;
extraTriggers.allocate("virtual interface member: " + item.m_ifacep->name() + "."
+ item.m_memberp->name());
}
const auto& preTreeps = getSenTreesUsedBy({&logicRegions.m_pre});
const auto& senTreeps = getSenTreesUsedBy({&logicRegions.m_act, //
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
&logicRegions.m_nba, //
&logicRegions.m_obs, //
&logicRegions.m_react, //
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
&timingKit.m_lbs});
2026-02-11 19:35:59 +01:00
const TriggerKit trigKit
= TriggerKit::create(netlistp, staticp, senExprBuilder, preTreeps, senTreeps, "act",
extraTriggers, false, v3Global.usesTiming());
// Add post updates from the timing kit
2026-02-11 19:35:59 +01:00
if (timingKit.m_postUpdates) trigKit.compBasep()->addStmtsp(timingKit.m_postUpdates);
if (dpiExportTriggerVscp) {
trigKit.addExtraTriggerAssignment(dpiExportTriggerVscp, dpiExportTriggerIndex);
}
addVirtIfaceTriggerAssignments(virtIfaceTriggers, firstVifTriggerIndex,
firstVifMemberTriggerIndex, trigKit);
if (v3Global.opt.stats()) V3Stats::statsStage("sched-create-triggers");
// Note: Experiments so far show that running the Act (or Ico) regions on
// multiple threads is always a net loss, so only use multi-threading for
// NBA for now. This can be revised if evidence is available that it would
// be beneficial
// Step 9: Create the 'act' region evaluation function
// Remap sensitivities of the input logic to the triggers
remapSensitivities(logicRegions.m_pre, trigKit.mapPre());
remapSensitivities(logicRegions.m_act, trigKit.mapVec());
remapSensitivities(logicReplicas.m_act, trigKit.mapVec());
remapSensitivities(timingKit.m_lbs, trigKit.mapVec());
const std::map<const AstVarScope*, std::vector<AstSenTree*>> actTimingDomains
= timingKit.remapDomains(trigKit.mapVec());
// Create the inverse map from trigger ref AstSenTree to original AstSenTree
V3Order::TrigToSenMap trigToSenAct;
invertAndMergeSenTreeMap(trigToSenAct, trigKit.mapPre());
invertAndMergeSenTreeMap(trigToSenAct, trigKit.mapVec());
// The DPI Export trigger AstSenTree
AstSenTree* const dpiExportTriggeredAct
= dpiExportTriggerVscp
? trigKit.newExtraTriggerSenTree(trigKit.vscp(), dpiExportTriggerIndex)
: nullptr;
const auto& vifMemberTriggeredAct = virtIfaceTriggers.makeMemberToSensMap(
trigKit, firstVifMemberTriggerIndex, trigKit.vscp());
AstCFunc* const actFuncp = V3Order::order(
netlistp, {&logicRegions.m_pre, &logicRegions.m_act, &logicReplicas.m_act}, trigToSenAct,
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
"act", false, false, [&](const AstVarScope* vscp, std::vector<AstSenTree*>& out) {
auto it = actTimingDomains.find(vscp);
if (it != actTimingDomains.end()) out = it->second;
if (vscp->varp()->isWrittenByDpi()) out.push_back(dpiExportTriggeredAct);
if (vscp->varp()->isVirtIface()) {
std::vector<AstSenTree*> ifaceTriggered
= findTriggeredIface(vscp, vifMemberTriggeredAct);
out.insert(out.end(), ifaceTriggered.begin(), ifaceTriggered.end());
}
});
util::splitCheck(actFuncp);
if (v3Global.opt.stats()) V3Stats::statsStage("sched-create-act");
const EvalKit actKit{trigKit.vscp(), actFuncp};
2022-12-23 13:34:49 +01:00
// Orders a region's logic and creates the region eval function
const auto order = [&](const std::string& name,
const std::vector<V3Sched::LogicByScope*>& logic) -> EvalKit {
UINFO(2, "Scheduling " << name << " #logic = " << logic.size());
AstVarScope* const trigVscp = trigKit.newTrigVec(name);
const auto trigMap = cloneMapWithNewTriggerReferences(trigKit.mapVec(), trigVscp);
2022-12-23 13:34:49 +01:00
// Remap sensitivities of the input logic to the triggers
for (LogicByScope* lbs : logic) remapSensitivities(*lbs, trigMap);
// Create the inverse map from trigger ref AstSenTree to original AstSenTree
V3Order::TrigToSenMap trigToSen;
2022-12-23 13:34:49 +01:00
invertAndMergeSenTreeMap(trigToSen, trigMap);
AstSenTree* const dpiExportTriggered
= dpiExportTriggerVscp
? trigKit.newExtraTriggerSenTree(trigVscp, dpiExportTriggerIndex)
: nullptr;
const auto& vifMemberTriggered
= virtIfaceTriggers.makeMemberToSensMap(trigKit, firstVifMemberTriggerIndex, trigVscp);
2022-12-23 13:34:49 +01:00
const auto& timingDomains = timingKit.remapDomains(trigMap);
AstCFunc* const funcp = V3Order::order(
netlistp, logic, trigToSen, name, name == "nba" && v3Global.opt.mtasks(), false,
[&](const AstVarScope* vscp, std::vector<AstSenTree*>& out) {
auto it = timingDomains.find(vscp);
if (it != timingDomains.end()) out = it->second;
if (vscp->varp()->isWrittenByDpi()) out.push_back(dpiExportTriggered);
// Virtual interfaces are sometimes mixed with non-virtual ones, so both have to be
// detected here - see `t_virtual_interface_nba_assign`
if (vscp->varp()->sensIfacep() || vscp->varp()->isVirtIface()) {
std::vector<AstSenTree*> ifaceTriggered
= findTriggeredIface(vscp, vifMemberTriggered);
out.insert(out.end(), ifaceTriggered.begin(), ifaceTriggered.end());
}
2022-12-23 13:34:49 +01:00
});
return {trigVscp, funcp};
2022-12-23 13:34:49 +01:00
};
// Step 10: Create the 'nba' region evaluation function
const EvalKit nbaKit = order("nba", {&logicRegions.m_nba, &logicReplicas.m_nba});
util::splitCheck(nbaKit.m_funcp);
2022-12-23 13:34:49 +01:00
netlistp->evalNbap(nbaKit.m_funcp); // Remember for V3LifePost
if (v3Global.opt.stats()) V3Stats::statsStage("sched-create-nba");
2022-12-23 13:34:49 +01:00
// Orders a region's logic and creates the region eval function (only if there is any logic in
// the region)
const auto orderIfNonEmpty
= [&](const std::string& name, const std::vector<LogicByScope*>& logic) -> EvalKit {
if (logic[0]->empty())
return {}; // if region is empty, replica is supposed to be empty as well
const auto& kit = order(name, logic);
2022-12-23 13:34:49 +01:00
if (v3Global.opt.stats()) V3Stats::statsStage("sched-create-" + name);
return kit;
};
// Step 11: Create the 'obs' region evaluation function
const EvalKit obsKit = orderIfNonEmpty("obs", {&logicRegions.m_obs, &logicReplicas.m_obs});
2022-12-23 13:34:49 +01:00
// Step 12: Create the 'react' region evaluation function
const EvalKit reactKit
= orderIfNonEmpty("react", {&logicRegions.m_react, &logicReplicas.m_react});
2022-12-23 13:34:49 +01:00
// Step 13: Create the 'postponed' region evaluation function
auto* const postponedFuncp = createPostponed(netlistp, logicClasses);
2022-12-23 13:34:49 +01:00
// Step 14: Bolt it all together to create the '_eval' function
createEval(netlistp, icoLoopp, trigKit, actKit, nbaKit, obsKit, reactKit, postponedFuncp,
timingKit);
Timing support (#3363) Adds timing support to Verilator. It makes it possible to use delays, event controls within processes (not just at the start), wait statements, and forks. Building a design with those constructs requires a compiler that supports C++20 coroutines (GCC 10, Clang 5). The basic idea is to have processes and tasks with delays/event controls implemented as C++20 coroutines. This allows us to suspend and resume them at any time. There are five main runtime classes responsible for managing suspended coroutines: * `VlCoroutineHandle`, a wrapper over C++20's `std::coroutine_handle` with move semantics and automatic cleanup. * `VlDelayScheduler`, for coroutines suspended by delays. It resumes them at a proper simulation time. * `VlTriggerScheduler`, for coroutines suspended by event controls. It resumes them if its corresponding trigger was set. * `VlForkSync`, used for syncing `fork..join` and `fork..join_any` blocks. * `VlCoroutine`, the return type of all verilated coroutines. It allows for suspending a stack of coroutines (normally, C++ coroutines are stackless). There is a new visitor in `V3Timing.cpp` which: * scales delays according to the timescale, * simplifies intra-assignment timing controls and net delays into regular timing controls and assignments, * simplifies wait statements into loops with event controls, * marks processes and tasks with timing controls in them as suspendable, * creates delay, trigger scheduler, and fork sync variables, * transforms timing controls and fork joins into C++ awaits There are new functions in `V3SchedTiming.cpp` (used by `V3Sched.cpp`) that integrate static scheduling with timing. This involves providing external domains for variables, so that the necessary combinational logic gets triggered after coroutine resumption, as well as statements that need to be injected into the design eval function to perform this resumption at the correct time. 
There is also a function that transforms forked processes into separate functions. See the comments in `verilated_timing.h`, `verilated_timing.cpp`, `V3Timing.cpp`, and `V3SchedTiming.cpp`, as well as the internals documentation for more details. Signed-off-by: Krzysztof Bieganski <kbieganski@antmicro.com>
2022-08-22 14:26:32 +02:00
2026-02-11 19:35:59 +01:00
// Step 15: Add necessary evaluation before awaits
if (AstCCall* const readyp = timingKit.createReady(netlistp)) {
staticp->addStmtsp(readyp->makeStmt());
beforeTrigVisitor(netlistp, senExprBuilder, trigKit);
} else {
// beforeTrigVisitor clears SenTree pointers in AstCAwaits (as these SenTrees will get
// deleted later). If there was no need to call it, they have to be cleared manually.
netlistp->foreach([](AstCAwait* const cAwaitp) { cAwaitp->clearSentreep(); });
}
if (AstVarScope* const trigAccp = trigKit.vscAccp()) {
// Copy trigger vector to accumulator at the end of static initialization, so
// triggers fired during initialization persist to the first resume.
const AstUnpackArrayDType* const trigAccDTypep
= VN_AS(trigAccp->dtypep(), UnpackArrayDType);
UASSERT_OBJ(
trigAccDTypep->right() == 0, trigAccp,
"Expected that trigger vector and accumulator start elements enumeration from 0");
UASSERT_OBJ(trigAccDTypep->left() >= 0, trigAccp,
"Expected that trigger vector and accumulator has no negative indexes");
FileLine* const flp = trigAccp->fileline();
AstVarScope* const vscp = netlistp->topScopep()->scopep()->createTemp("__Vi", 32);
AstLoop* const loopp = new AstLoop{flp};
loopp->addStmtsp(
new AstAssign{flp,
new AstArraySel{flp, new AstVarRef{flp, trigAccp, VAccess::WRITE},
new AstVarRef{flp, vscp, VAccess::READ}},
new AstArraySel{flp, new AstVarRef{flp, actKit.m_vscp, VAccess::READ},
new AstVarRef{flp, vscp, VAccess::READ}}});
loopp->addStmtsp(util::incrementVar(vscp));
loopp->addStmtsp(new AstLoopTest{
flp, loopp,
new AstLte{flp, new AstVarRef{flp, vscp, VAccess::READ},
new AstConst{flp, AstConst::WidthedValue{}, 32,
static_cast<uint32_t>(trigAccDTypep->left())}}});
staticp->addStmtsp(loopp);
}
// Step 16: Clean up
2026-01-23 18:53:40 +01:00
netlistp->clearStlFirstIterationp();
// Haven't split static initializer yet
util::splitCheck(staticp);
// Dump
V3Global::dumpCheckGlobalTree("sched", 0, dumpTreeEitherLevel() >= 3);
}
} // namespace V3Sched