diff --git a/bin/verilator_gantt b/bin/verilator_gantt index 94f52870d..b6a0a4d31 100755 --- a/bin/verilator_gantt +++ b/bin/verilator_gantt @@ -8,16 +8,17 @@ import collections import math import re import statistics +from collections import OrderedDict # from pprint import pprint -Sections = [] +Sections = OrderedDict() LongestVcdStrValueLength = 0 Threads = collections.defaultdict(lambda: []) # List of records per thread id Mtasks = collections.defaultdict(lambda: {'elapsed': 0, 'end': 0}) Cpus = collections.defaultdict(lambda: {'mtask_time': 0}) Global = {'args': {}, 'cpuinfo': collections.defaultdict(lambda: {}), 'stats': {}} ElapsedTime = None # total elapsed time -ExecGraphTime = 0 # total elapsed time excuting an exec graph +ExecGraphTime = 0 # total elapsed time executing an exec graph ExecGraphIntervals = [] # list of (start, end) pairs ###################################################################### @@ -37,11 +38,11 @@ def read_data(filename): re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$') cpu = None thread = 0 - execGraphStart = None global LongestVcdStrValueLength global ExecGraphTime + ExecGraphStack = [] SectionStack = [] mTaskThread = {} @@ -88,16 +89,17 @@ def read_data(filename): Mtasks[mtask]['predict_cost'] = predict_cost Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick) elif kind == "EXEC_GRAPH_BEGIN": - execGraphStart = tick + ExecGraphStack.append(tick) elif kind == "EXEC_GRAPH_END": + assert ExecGraphStack, "EXEC_GRAPH_END without EXEC_GRAPH_BEGIN" + execGraphStart = ExecGraphStack.pop() ExecGraphTime += tick - execGraphStart ExecGraphIntervals.append((execGraphStart, tick)) - execGraphStart = None elif Args.debug: print("-Unknown execution trace record: %s" % line) elif re_thread.match(line): thread = int(re_thread.match(line).group(1)) - Sections.append([]) + Sections[thread] = [] elif re.match(r'^VLPROF(THREAD|VERSION)', line): pass elif re_arg1.match(line): @@ -308,7 +310,7 @@ def report_cpus(): def report_sections(): - for thread, section in enumerate(Sections): + for thread, section in Sections.items(): if section: print(f"\nSection profile for thread {thread}:") report_section(section) @@ -432,7 +434,10 @@ def write_vcd(filename): # Find the earliest MTask start after the start point, and the # latest MTask end before the end point, so we can scale to the # same range - start = tStart[bisect.bisect_left(tStart, start)] + tStartIdx = bisect.bisect_left(tStart, start) + if tStartIdx >= len(tStart): + continue + start = tStart[tStartIdx] end = tEnd[bisect.bisect_right(tEnd, end) - 1] # Compute scale so predicted graph is of same width as interval measured_scaling = (end - start) / Global['predict_last_end'] @@ -462,7 +467,7 @@ def write_vcd(filename): addValue(pcode, time, value) # Section graph - for thread, section in enumerate(Sections): + for thread, section in Sections.items(): if section: scode = getCode(LongestVcdStrValueLength * 8, "section", f"t{thread}_trace") dcode = getCode(32, "section", f"t{thread}_depth") diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst index 44c9dd6ec..89d8a990a 100644 --- a/docs/guide/exe_verilator.rst +++ b/docs/guide/exe_verilator.rst @@ -2124,6 +2124,20 @@ The grammar of configuration commands is as follows: This option should not be used directly. See :ref:`Hierarchical Verilation`. +.. option:: hier_workers -hier-dpi "" -workers + + Specifies how many threads need to be used for scheduling hierarchical DPI + tasks. 
This data is inserted internally by :vlopt:`--hierarchical`, + based on the value specified in :option:`hier_workers -module`. This option + should not be used directly. See :ref:`Hierarchical Verilation`. + +.. option:: hier_workers -module "" -workers + + Specifies how many threads should be used for scheduling the given module, which + must have a :option:`/*verilator&32;hier_block*/` metacomment. This number must not + exceed :vlopt:`--threads`, so that the block fits in the thread schedule. + See :ref:`Hierarchical Verilation`. + .. option:: inline -module "" Specifies the module may be inlined into any modules that use this diff --git a/include/verilated_threads.cpp b/include/verilated_threads.cpp index 4c048e04e..9f36f85e5 100644 --- a/include/verilated_threads.cpp +++ b/include/verilated_threads.cpp @@ -100,7 +100,10 @@ void VlWorkerThread::startWorker(VlWorkerThread* workerp, VerilatedContext* cont // VlThreadPool VlThreadPool::VlThreadPool(VerilatedContext* contextp, unsigned nThreads) { - for (unsigned i = 0; i < nThreads; ++i) m_workers.push_back(new VlWorkerThread{contextp}); + for (unsigned i = 0; i < nThreads; ++i) { + m_workers.push_back(new VlWorkerThread{contextp}); + m_unassignedWorkers.push(i); + } } VlThreadPool::~VlThreadPool() { diff --git a/include/verilated_threads.h b/include/verilated_threads.h index b0e40d3b5..a2722b164 100644 --- a/include/verilated_threads.h +++ b/include/verilated_threads.h @@ -30,6 +30,7 @@ #include #include #include +#include <stack> #include #include @@ -205,6 +206,13 @@ class VlThreadPool final : public VerilatedVirtualBase { // MEMBERS std::vector<VlWorkerThread*> m_workers; // our workers + // Guards indexes of unassigned workers + mutable VerilatedMutex m_mutex; + // Indexes of unassigned workers + std::stack<size_t> m_unassignedWorkers VL_GUARDED_BY(m_mutex); + // Used for sequentially generating task IDs to avoid shadowing + std::atomic<unsigned> m_assignedTasks{0}; + public: // CONSTRUCTORS // Construct a thread pool with 'nThreads' dedicated threads. The thread @@ -214,6 +222,19 @@ public: ~VlThreadPool() override; // METHODS + size_t assignWorkerIndex() { + const VerilatedLockGuard lock{m_mutex}; + assert(!m_unassignedWorkers.empty()); + const size_t index = m_unassignedWorkers.top(); + m_unassignedWorkers.pop(); + return index; + } + void freeWorkerIndexes(std::vector<size_t>& indexes) { + const VerilatedLockGuard lock{m_mutex}; + for (size_t index : indexes) m_unassignedWorkers.push(index); + indexes.clear(); + } + unsigned assignTaskIndex() { return m_assignedTasks++; } int numThreads() const { return static_cast<int>(m_workers.size()); } VlWorkerThread* workerp(int index) { assert(index >= 0); diff --git a/src/V3Config.cpp b/src/V3Config.cpp index f637cacd3..59d7460cc 100644 --- a/src/V3Config.cpp +++ b/src/V3Config.cpp @@ -540,6 +540,8 @@ class V3ConfigResolver final { std::unordered_map<string, std::unordered_map<string, uint64_t>> m_profileData; // Access to profile_data records uint8_t m_mode = NONE; + std::unordered_map<string, int> m_hierWorkers; + FileLine* m_hierWorkersFileLine = nullptr; FileLine* m_profileFileLine = nullptr; V3ConfigResolver() = default; @@ -570,6 +572,16 @@ public: // Empty key for hierarchical DPI wrapper costs. return getProfileData(hierDpi, ""); } + void addHierWorkers(FileLine* fl, const string& model, int workers) { + if (!m_hierWorkersFileLine) m_hierWorkersFileLine = fl; + m_hierWorkers[model] = workers; + } + int getHierWorkers(const string& model) const { + const auto mit = m_hierWorkers.find(model); + // Assign a single worker if none is specified. + return mit != m_hierWorkers.cend() ? 
mit->second : 0; + } + FileLine* getHierWorkersFileLine() const { return m_hierWorkersFileLine; } uint64_t getProfileData(const string& model, const string& key) const { const auto mit = m_profileData.find(model); if (mit == m_profileData.cend()) return 0; @@ -602,6 +614,10 @@ void V3Config::addCoverageBlockOff(const string& module, const string& blockname V3ConfigResolver::s().modules().at(module).addCoverageBlockOff(blockname); } +void V3Config::addHierWorkers(FileLine* fl, const string& model, int workers) { + V3ConfigResolver::s().addHierWorkers(fl, model, workers); +} + void V3Config::addIgnore(V3ErrorCode code, bool on, const string& filename, int min, int max) { if (filename == "*") { FileLine::globalWarnOff(code, !on); @@ -741,6 +757,12 @@ void V3Config::applyVarAttr(AstNodeModule* modulep, AstNodeFTask* ftaskp, AstVar if (vp) vp->apply(varp); } +int V3Config::getHierWorkers(const string& model) { + return V3ConfigResolver::s().getHierWorkers(model); +} +FileLine* V3Config::getHierWorkersFileLine() { + return V3ConfigResolver::s().getHierWorkersFileLine(); +} uint64_t V3Config::getProfileData(const string& hierDpi) { return V3ConfigResolver::s().getProfileData(hierDpi); } diff --git a/src/V3Config.h b/src/V3Config.h index df7287c07..34b1aad55 100644 --- a/src/V3Config.h +++ b/src/V3Config.h @@ -33,6 +33,7 @@ public: static void addCaseParallel(const string& file, int lineno); static void addCoverageBlockOff(const string& file, int lineno); static void addCoverageBlockOff(const string& module, const string& blockname); + static void addHierWorkers(FileLine* fl, const string& model, int workers); static void addIgnore(V3ErrorCode code, bool on, const string& filename, int min, int max); static void addIgnoreMatch(V3ErrorCode code, const string& filename, const string& contents, const string& match); @@ -52,6 +53,8 @@ public: static void applyModule(AstNodeModule* modulep); static void applyVarAttr(AstNodeModule* modulep, AstNodeFTask* ftaskp, AstVar* varp); + static int getHierWorkers(const string& model); + static FileLine* getHierWorkersFileLine(); static uint64_t getProfileData(const string& hierDpi); static uint64_t getProfileData(const string& model, const string& key); static FileLine* getProfileDataFileLine(); diff --git a/src/V3ExecGraph.cpp b/src/V3ExecGraph.cpp index 4ca91904a..25bd7d864 100644 --- a/src/V3ExecGraph.cpp +++ b/src/V3ExecGraph.cpp @@ -63,6 +63,10 @@ namespace V3ExecGraph { class ThreadSchedule final { friend class PackThreads; + uint32_t m_id; // Unique ID of a schedule + static uint32_t s_nextId; // Next ID number to use + std::unordered_set mtasks; // Mtasks in this schedule + public: // CONSTANTS static constexpr uint32_t UNASSIGNED = 0xffffffff; @@ -79,21 +83,18 @@ public: // the sequence of MTasks to be executed by that thread. std::vector> threads; - // State for each mtask. - std::unordered_map mtaskState; + // Global state for each mtask. + static std::unordered_map mtaskState; - uint32_t threadId(const ExecMTask* mtaskp) const { - const auto& it = mtaskState.find(mtaskp); - return it != mtaskState.end() ? 
it->second.threadId : UNASSIGNED; - } - -private: explicit ThreadSchedule(uint32_t nThreads) - : threads{nThreads} {} - VL_UNCOPYABLE(ThreadSchedule); // But movable + : m_id(s_nextId++) + , threads{nThreads} {} ThreadSchedule(ThreadSchedule&&) = default; ThreadSchedule& operator=(ThreadSchedule&&) = default; +private: + VL_UNCOPYABLE(ThreadSchedule); + // Debugging void dumpDotFile(const V3Graph& graph, const string& filename) const { // This generates a file used by graphviz, https://www.graphviz.org @@ -168,6 +169,17 @@ private: } public: + static uint32_t threadId(const ExecMTask* mtaskp) { + const auto& it = mtaskState.find(mtaskp); + return it != mtaskState.end() ? it->second.threadId : UNASSIGNED; + } + static uint32_t startTime(const ExecMTask* mtaskp) { + return mtaskState.at(mtaskp).completionTime - mtaskp->cost(); + } + static uint32_t endTime(const ExecMTask* mtaskp) { + return mtaskState.at(mtaskp).completionTime; + } + // Returns the number of cross-thread dependencies of the given MTask. If > 0, the MTask must // test whether its dependencies are ready before starting, and therefore may need to block. uint32_t crossThreadDependencies(const ExecMTask* mtaskp) const { @@ -175,19 +187,32 @@ public: uint32_t result = 0; for (const V3GraphEdge& edge : mtaskp->inEdges()) { const ExecMTask* const prevp = edge.fromp()->as(); - if (threadId(prevp) != thisThreadId) ++result; + if (threadId(prevp) != thisThreadId && contains(prevp)) ++result; } return result; } - uint32_t startTime(const ExecMTask* mtaskp) const { - return mtaskState.at(mtaskp).completionTime - mtaskp->cost(); - } - uint32_t endTime(const ExecMTask* mtaskp) const { - return mtaskState.at(mtaskp).completionTime; + uint32_t id() const { return m_id; } + uint32_t scheduleOn(const ExecMTask* mtaskp, uint32_t bestThreadId) { + mtasks.emplace(mtaskp); + const uint32_t bestEndTime = mtaskp->predictStart() + mtaskp->cost(); + mtaskState[mtaskp].completionTime = bestEndTime; + mtaskState[mtaskp].threadId = bestThreadId; + + // Reference to thread in schedule we are assigning this MTask to. + std::vector& bestThread = threads[bestThreadId]; + if (!bestThread.empty()) mtaskState[bestThread.back()].nextp = mtaskp; + + // Add the MTask to the schedule + bestThread.push_back(mtaskp); + return bestEndTime; } + bool contains(const ExecMTask* mtaskp) const { return mtasks.count(mtaskp); } }; +uint32_t ThreadSchedule::s_nextId = 0; +std::unordered_map ThreadSchedule::mtaskState{}; + //###################################################################### // PackThreads @@ -260,7 +285,7 @@ class PackThreads final { return sandbaggedEndTime; } - bool isReady(ThreadSchedule& schedule, const ExecMTask* mtaskp) { + static bool isReady(ThreadSchedule& schedule, const ExecMTask* mtaskp) { for (const V3GraphEdge& edgeIn : mtaskp->inEdges()) { const ExecMTask* const prevp = edgeIn.fromp()->as(); if (schedule.threadId(prevp) == ThreadSchedule::UNASSIGNED) { @@ -272,20 +297,39 @@ class PackThreads final { } // Pack an MTasks from given graph into m_nThreads threads, return the schedule. - ThreadSchedule pack(V3Graph& mtaskGraph) { - // The result - ThreadSchedule schedule{m_nThreads}; + std::vector pack(V3Graph& mtaskGraph) { + std::vector result; + result.emplace_back(ThreadSchedule{m_nThreads}); + + // To support scheduling tasks that utilize more than one thread, we introduce a wide + // task (ExecMTask with threads() > 1). 
Those tasks are scheduled on a separate thread + schedule, to ensure that the indexes of simulation-time thread pool workers are not shadowed + by other tasks. + // To retain control over thread schedules, we distinguish these SchedulingModes: + enum class SchedulingMode { + SCHEDULING // Schedule normal tasks + , + WIDE_TASK_DISCOVERED // We found a wide task; if it is the only kind available, + // switch to WIDE_TASK_SCHEDULING + , + WIDE_TASK_SCHEDULING // Schedule wide tasks + }; + SchedulingMode mode = SchedulingMode::SCHEDULING; // Time each thread is occupied until std::vector<uint32_t> busyUntil(m_nThreads, 0); // MTasks ready to be assigned next. All their dependencies are already assigned. std::set readyMTasks; + int maxThreadWorkers = 1; // Build initial ready list for (V3GraphVertex& vtx : mtaskGraph.vertices()) { ExecMTask* const mtaskp = vtx.as<ExecMTask>(); - if (isReady(schedule, mtaskp)) readyMTasks.insert(mtaskp); + if (isReady(result.back(), mtaskp)) readyMTasks.insert(mtaskp); + // TODO: For simplicity, wide tasks are currently scheduled as if they all used the + // same number of threads. + maxThreadWorkers = std::max(maxThreadWorkers, mtaskp->threads()); } while (!readyMTasks.empty()) { @@ -294,8 +338,16 @@ class PackThreads final { uint32_t bestTime = 0xffffffff; uint32_t bestThreadId = 0; ExecMTask* bestMtaskp = nullptr; // Todo: const ExecMTask* - for (uint32_t threadId = 0; threadId < m_nThreads; ++threadId) { + ThreadSchedule& schedule = result.back(); + for (uint32_t threadId = 0; threadId < schedule.threads.size(); ++threadId) { for (ExecMTask* const mtaskp : readyMTasks) { + if (mode != SchedulingMode::WIDE_TASK_SCHEDULING && mtaskp->threads() > 1) { + mode = SchedulingMode::WIDE_TASK_DISCOVERED; + continue; + } + if (mode == SchedulingMode::WIDE_TASK_SCHEDULING && mtaskp->threads() <= 1) + continue; + uint32_t timeBegin = busyUntil[threadId]; if (timeBegin > bestTime) { UINFO(6, "th " << threadId << " busy until " << timeBegin @@ -321,23 +373,44 @@ class PackThreads final { } } + if (!bestMtaskp && mode == SchedulingMode::WIDE_TASK_DISCOVERED) { + mode = SchedulingMode::WIDE_TASK_SCHEDULING; + const uint32_t size = m_nThreads / maxThreadWorkers; + UASSERT(size, "Thread pool size should be bigger than 0"); + // If no tasks were added to the normal thread schedule, remove it. + if (schedule.mtaskState.empty()) result.erase(result.begin()); + result.emplace_back(ThreadSchedule{size}); + continue; + } + + if (!bestMtaskp && mode == SchedulingMode::WIDE_TASK_SCHEDULING) { + mode = SchedulingMode::SCHEDULING; + if (!schedule.mtaskState.empty()) result.emplace_back(ThreadSchedule{m_nThreads}); + continue; + } + UASSERT(bestMtaskp, "Should have found some task"); - UINFO(6, "Will schedule " << bestMtaskp->name() << " onto thread " << bestThreadId - << endl); - // Reference to thread in schedule we are assigning this MTask to. 
- std::vector& bestThread = schedule.threads[bestThreadId]; + bestMtaskp->predictStart(bestTime); + const uint32_t bestEndTime = schedule.scheduleOn(bestMtaskp, bestThreadId); - // Update algorithm state - bestMtaskp->predictStart(bestTime); // Only for gantt reporting - const uint32_t bestEndTime = bestTime + bestMtaskp->cost(); - schedule.mtaskState[bestMtaskp].completionTime = bestEndTime; - schedule.mtaskState[bestMtaskp].threadId = bestThreadId; - if (!bestThread.empty()) schedule.mtaskState[bestThread.back()].nextp = bestMtaskp; - busyUntil[bestThreadId] = bestEndTime; - - // Add the MTask to the schedule - bestThread.push_back(bestMtaskp); + // Populate busyUntil timestamps. For multi-worker tasks, set timestamps for + // offsetted threads. + if (mode != SchedulingMode::WIDE_TASK_SCHEDULING) { + busyUntil[bestThreadId] = bestEndTime; + } else { + for (int i = 0; i < maxThreadWorkers; ++i) { + const size_t threadId = bestThreadId + (i * schedule.threads.size()); + UASSERT(threadId < busyUntil.size(), + "Incorrect busyUntil offset: threadId=" + cvtToStr(threadId) + + " bestThreadId=" + cvtToStr(bestThreadId) + " i=" + cvtToStr(i) + + " schedule-size=" + cvtToStr(schedule.threads.size()) + + " maxThreadWorkers=" + cvtToStr(maxThreadWorkers)); + busyUntil[threadId] = bestEndTime; + UINFO(6, "Will schedule " << bestMtaskp->name() << " onto thread " << threadId + << endl); + } + } // Update the ready list const size_t erased = readyMTasks.erase(bestMtaskp); @@ -357,9 +430,10 @@ class PackThreads final { } } - if (dumpGraphLevel() >= 4) schedule.dumpDotFilePrefixedAlways(mtaskGraph, "schedule"); + // All schedules are combined on a single graph + if (dumpGraphLevel() >= 4) result.back().dumpDotFilePrefixedAlways(mtaskGraph, "schedule"); - return schedule; + return result; } public: @@ -383,49 +457,78 @@ public: ExecMTask* const t2 = new ExecMTask{&graph, makeBody()}; t2->cost(100); t2->priority(100); + t2->threads(2); + ExecMTask* const t3 = new ExecMTask{&graph, makeBody()}; + t3->cost(100); + t3->priority(100); + t3->threads(3); + ExecMTask* const t4 = new ExecMTask{&graph, makeBody()}; + t4->cost(100); + t4->priority(100); + t4->threads(3); + /* + 0 + / \ + 1 2 + / \ + 3 4 + */ new V3GraphEdge{&graph, t0, t1, 1}; new V3GraphEdge{&graph, t0, t2, 1}; + new V3GraphEdge{&graph, t2, t3, 1}; + new V3GraphEdge{&graph, t2, t4, 1}; - PackThreads packer{2, // Threads + constexpr uint32_t threads = 6; + PackThreads packer{threads, 3, // Sandbag numerator 10}; // Sandbag denom - const ThreadSchedule& schedule = packer.pack(graph); - UASSERT_SELFTEST(size_t, schedule.threads.size(), 2); + const std::vector scheduled = packer.pack(graph); + UASSERT_SELFTEST(size_t, scheduled[0].threads.size(), threads); + UASSERT_SELFTEST(size_t, scheduled[0].threads[0].size(), 2); + for (size_t i = 1; i < scheduled[0].threads.size(); ++i) + UASSERT_SELFTEST(size_t, scheduled[0].threads[i].size(), 0); - UASSERT_SELFTEST(size_t, schedule.threads[0].size(), 2); - UASSERT_SELFTEST(size_t, schedule.threads[1].size(), 1); + UASSERT_SELFTEST(const ExecMTask*, scheduled[0].threads[0][0], t0); + UASSERT_SELFTEST(const ExecMTask*, scheduled[0].threads[0][1], t1); - UASSERT_SELFTEST(const ExecMTask*, schedule.threads[0][0], t0); - UASSERT_SELFTEST(const ExecMTask*, schedule.threads[0][1], t1); - UASSERT_SELFTEST(const ExecMTask*, schedule.threads[1][0], t2); + UASSERT_SELFTEST(size_t, scheduled[1].threads.size(), threads / 3); + UASSERT_SELFTEST(const ExecMTask*, scheduled[1].threads[1][0], t2); + UASSERT_SELFTEST(const 
ExecMTask*, scheduled[1].threads[1][1], t3); + UASSERT_SELFTEST(const ExecMTask*, scheduled[1].threads[0][0], t4); - UASSERT_SELFTEST(size_t, schedule.mtaskState.size(), 3); + UASSERT_SELFTEST(size_t, ThreadSchedule::mtaskState.size(), 5); - UASSERT_SELFTEST(uint32_t, schedule.threadId(t0), 0); - UASSERT_SELFTEST(uint32_t, schedule.threadId(t1), 0); - UASSERT_SELFTEST(uint32_t, schedule.threadId(t2), 1); + UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t0), 0); + UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t1), 0); + UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t2), 1); + UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t3), 1); + UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t4), 0); // On its native thread, we see the actual end time for t0: - UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t0, 0), 1000); + UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[0], t0, 0), 1000); // On the other thread, we see a sandbagged end time which does not // exceed the t1 end time: - UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t0, 1), 1099); + UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[0], t0, 1), 1099); // Actual end time on native thread: - UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t1, 0), 1100); + UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[0], t1, 0), 1100); // Sandbagged end time seen on thread 1. Note it does not compound // with t0's sandbagged time; compounding caused trouble in // practice. - UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t1, 1), 1130); - UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t2, 0), 1229); - UASSERT_SELFTEST(uint32_t, packer.completionTime(schedule, t2, 1), 1199); + UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[0], t1, 1), 1130); + UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t2, 0), 1229); + UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t2, 1), 1199); + UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t3, 0), 1329); + UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t3, 1), 1299); + UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t4, 0), 1329); + UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t4, 1), 1359); for (AstNode* const nodep : mTaskBodyps) nodep->deleteTree(); } - static const ThreadSchedule apply(V3Graph& mtaskGraph) { + static std::vector apply(V3Graph& mtaskGraph) { return PackThreads{}.pack(mtaskGraph); } }; @@ -644,7 +747,7 @@ void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t threadId, // For any dependent mtask that's on another thread, signal one dependency completion. 
for (const V3GraphEdge& edge : mtaskp->outEdges()) { const ExecMTask* const nextp = edge.top()->as(); - if (schedule.threadId(nextp) != threadId) { + if (schedule.threadId(nextp) != threadId && schedule.contains(nextp)) { addStrStmt("vlSelf->__Vm_mtaskstate_" + cvtToStr(nextp->id()) + ".signalUpstreamDone(even_cycle);\n"); } @@ -662,7 +765,8 @@ const std::vector createThreadFunctions(const ThreadSchedule& schedul for (const std::vector& thread : schedule.threads) { if (thread.empty()) continue; const uint32_t threadId = schedule.threadId(thread.front()); - const string name{"__Vthread__" + tag + "__" + cvtToStr(threadId)}; + const string name{"__Vthread__" + tag + "__t" + cvtToStr(threadId) + "__s" + + cvtToStr(schedule.id())}; AstCFunc* const funcp = new AstCFunc{fl, name, nullptr, "void"}; modp->addStmtsp(funcp); funcps.push_back(funcp); @@ -681,7 +785,8 @@ const std::vector createThreadFunctions(const ThreadSchedule& schedul } // Unblock the fake "final" mtask when this thread is finished - funcp->addStmtsp(new AstCStmt{fl, "vlSelf->__Vm_mtaskstate_final__" + tag + funcp->addStmtsp(new AstCStmt{fl, "vlSelf->__Vm_mtaskstate_final__" + + cvtToStr(schedule.id()) + tag + ".signalUpstreamDone(even_cycle);\n"}); } @@ -689,7 +794,8 @@ const std::vector createThreadFunctions(const ThreadSchedule& schedul AstBasicDType* const mtaskStateDtypep = v3Global.rootp()->typeTablep()->findBasicDType(fl, VBasicDTypeKwd::MTASKSTATE); AstVar* const varp - = new AstVar{fl, VVarType::MODULETEMP, "__Vm_mtaskstate_final__" + tag, mtaskStateDtypep}; + = new AstVar{fl, VVarType::MODULETEMP, + "__Vm_mtaskstate_final__" + cvtToStr(schedule.id()) + tag, mtaskStateDtypep}; varp->valuep(new AstConst(fl, funcps.size())); varp->protect(false); // Do not protect as we still have references in AstText modp->addStmtsp(varp); @@ -697,8 +803,40 @@ const std::vector createThreadFunctions(const ThreadSchedule& schedul return funcps; } +void addThreadStartWrapper(AstExecGraph* const execGraphp) { + // FileLine used for constructing nodes below + FileLine* const fl = v3Global.rootp()->fileline(); + const string& tag = execGraphp->name(); + + // Add thread function invocations to execGraph + const auto addStrStmt = [=](const string& stmt) -> void { // + execGraphp->addStmtsp(new AstCStmt{fl, stmt}); + }; + + if (v3Global.opt.profExec()) { + addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphBegin();\n"); + } + + addStrStmt("vlSymsp->__Vm_even_cycle__" + tag + " = !vlSymsp->__Vm_even_cycle__" + tag + + ";\n"); + + if (!v3Global.opt.hierBlocks().empty()) addStrStmt("std::vector indexes;\n"); +} + +void addThreadEndWrapper(AstExecGraph* const execGraphp) { + // Add thread function invocations to execGraph + const auto addStrStmt = [=](const string& stmt) -> void { // + FileLine* const flp = v3Global.rootp()->fileline(); + execGraphp->addStmtsp(new AstCStmt{flp, stmt}); + }; + + addStrStmt("Verilated::mtaskId(0);\n"); + if (v3Global.opt.profExec()) { + addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphEnd();\n"); + } +} void addThreadStartToExecGraph(AstExecGraph* const execGraphp, - const std::vector& funcps) { + const std::vector& funcps, uint32_t scheduleId) { // FileLine used for constructing nodes below FileLine* const fl = v3Global.rootp()->fileline(); const string& tag = execGraphp->name(); @@ -711,19 +849,22 @@ void addThreadStartToExecGraph(AstExecGraph* const execGraphp, execGraphp->addStmtsp(new AstText{fl, text, /* tracking: */ true}); }; - if (v3Global.opt.profExec()) { - 
addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphBegin();\n"); - } - - addStrStmt("vlSymsp->__Vm_even_cycle__" + tag + " = !vlSymsp->__Vm_even_cycle__" + tag - + ";\n"); - const uint32_t last = funcps.size() - 1; - for (uint32_t i = 0; i <= last; ++i) { - AstCFunc* const funcp = funcps.at(i); + if (!v3Global.opt.hierBlocks().empty() && last > 0) { + addStrStmt( + "for (size_t i = 0; i < " + cvtToStr(last) + + "; ++i) indexes.push_back(vlSymsp->__Vm_threadPoolp->assignWorkerIndex());\n"); + } + uint32_t i = 0; + for (AstCFunc* const funcp : funcps) { if (i != last) { // The first N-1 will run on the thread pool. - addTextStmt("vlSymsp->__Vm_threadPoolp->workerp(" + cvtToStr(i) + ")->addTask("); + if (v3Global.opt.hierChild() || !v3Global.opt.hierBlocks().empty()) { + addTextStmt("vlSymsp->__Vm_threadPoolp->workerp(indexes[" + cvtToStr(i) + + "])->addTask("); + } else { + addTextStmt("vlSymsp->__Vm_threadPoolp->workerp(" + cvtToStr(i) + ")->addTask("); + } execGraphp->addStmtsp(new AstAddrOfCFunc{fl, funcp}); addTextStmt(", vlSelf, vlSymsp->__Vm_even_cycle__" + tag + ");\n"); } else { @@ -732,15 +873,16 @@ void addThreadStartToExecGraph(AstExecGraph* const execGraphp, callp->dtypeSetVoid(); callp->argTypes("vlSelf, vlSymsp->__Vm_even_cycle__" + tag); execGraphp->addStmtsp(callp->makeStmt()); - addStrStmt("Verilated::mtaskId(0);\n"); } + ++i; } + V3Stats::addStatSum("Optimizations, Thread schedule total tasks", i); - addStrStmt("vlSelf->__Vm_mtaskstate_final__" + tag + addStrStmt("vlSelf->__Vm_mtaskstate_final__" + std::to_string(scheduleId) + tag + ".waitUntilUpstreamDone(vlSymsp->__Vm_even_cycle__" + tag + ");\n"); - - if (v3Global.opt.profExec()) { - addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).execGraphEnd();\n"); + // Free all assigned worker indices in this section + if (!v3Global.opt.hierBlocks().empty() && last > 0) { + addStrStmt("vlSymsp->__Vm_threadPoolp->freeWorkerIndexes(indexes);\n"); } } @@ -762,15 +904,22 @@ void wrapMTaskBodies(AstExecGraph* const execGraphp) { funcp->addStmtsp(new AstCStmt{flp, stmt}); }; - if (v3Global.opt.profExec()) { + if (v3Global.opt.hierChild() || !v3Global.opt.hierBlocks().empty()) { + addStrStmt( + "static const unsigned taskId = vlSymsp->__Vm_threadPoolp->assignTaskIndex();\n"); + } else { const string& id = std::to_string(mtaskp->id()); + addStrStmt("static constexpr unsigned taskId = " + id + ";\n"); + } + + if (v3Global.opt.profExec() && mtaskp->threads() <= 1) { const string& predictStart = std::to_string(mtaskp->predictStart()); - addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskBegin(" + id + ", " + predictStart + addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskBegin(taskId, " + predictStart + ");\n"); } // Set mtask ID in the run-time system - addStrStmt("Verilated::mtaskId(" + std::to_string(mtaskp->id()) + ");\n"); + addStrStmt("Verilated::mtaskId(taskId);\n"); // Run body funcp->addStmtsp(mtaskBodyp->stmtsp()->unlinkFrBackWithNext()); @@ -778,10 +927,9 @@ void wrapMTaskBodies(AstExecGraph* const execGraphp) { // Flush message queue addStrStmt("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n"); - if (v3Global.opt.profExec()) { - const string& id = std::to_string(mtaskp->id()); + if (v3Global.opt.profExec() && mtaskp->threads() <= 1) { const string& predictConst = std::to_string(mtaskp->cost()); - addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskEnd(" + id + ", " + predictConst + addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).mtaskEnd(taskId, " + predictConst + ");\n"); } @@ -803,7 +951,7 @@ void 
implementExecGraph(AstExecGraph* const execGraphp, const ThreadSchedule& sc UASSERT(!funcps.empty(), "Non-empty ExecGraph yields no threads?"); // Start the thread functions at the point this AstExecGraph is located in the tree. - addThreadStartToExecGraph(execGraphp, funcps); + addThreadStartToExecGraph(execGraphp, funcps, schedule.id()); } void implement(AstNetlist* netlistp) { @@ -817,15 +965,25 @@ void implement(AstNetlist* netlistp) { fillinCosts(execGraphp->depGraphp()); finalizeCosts(execGraphp->depGraphp()); + if (dumpGraphLevel() >= 4) execGraphp->depGraphp()->dumpDotFilePrefixedAlways("pack"); + + addThreadStartWrapper(execGraphp); + // Schedule the mtasks: statically associate each mtask with a thread, // and determine the order in which each thread will run its mtasks. - const ThreadSchedule& schedule = PackThreads::apply(*execGraphp->depGraphp()); + const std::vector packed = PackThreads::apply(*execGraphp->depGraphp()); + V3Stats::addStatSum("Optimizations, Thread schedule count", + static_cast(packed.size())); // Wrap each MTask body into a CFunc for better profiling/debugging wrapMTaskBodies(execGraphp); - // Replace the graph body with its multi-threaded implementation. - implementExecGraph(execGraphp, schedule); + for (const ThreadSchedule& schedule : packed) { + // Replace the graph body with its multi-threaded implementation. + implementExecGraph(execGraphp, schedule); + } + + addThreadEndWrapper(execGraphp); }); } diff --git a/src/V3ExecGraph.h b/src/V3ExecGraph.h index fd4baa257..d16941b5c 100644 --- a/src/V3ExecGraph.h +++ b/src/V3ExecGraph.h @@ -43,6 +43,7 @@ private: // Predicted runtime of this mtask, in the same abstract time units as priority(). uint32_t m_cost = 0; uint64_t m_predictStart = 0; // Predicted start time of task + int m_threads = 1; // Threads used by this mtask VL_UNCOPYABLE(ExecMTask); public: @@ -57,6 +58,8 @@ public: void predictStart(uint64_t time) { m_predictStart = time; } string name() const override VL_MT_STABLE { return "mt"s + std::to_string(id()); } string hashName() const { return m_hashName; } + void threads(int threads) { m_threads = threads; } + int threads() const { return m_threads; } void dump(std::ostream& str) const; static uint32_t numUsedIds() VL_MT_SAFE { return s_nextId; } diff --git a/src/V3HierBlock.cpp b/src/V3HierBlock.cpp index 244d66897..f4a689ff9 100644 --- a/src/V3HierBlock.cpp +++ b/src/V3HierBlock.cpp @@ -88,6 +88,7 @@ #include "V3HierBlock.h" +#include "V3Config.h" #include "V3EmitV.h" #include "V3File.h" #include "V3Os.h" @@ -188,6 +189,23 @@ V3StringList V3HierBlock::commandArgs(bool forCMake) const { if (!params().gTypeParams().empty()) opts.push_back(" --hierarchical-params-file " + typeParametersFilename()); + const int blockThreads = V3Config::getHierWorkers(m_modp->origName()); + if (blockThreads > 1) { + if (hasParent()) { + V3Config::getHierWorkersFileLine()->v3warn( + E_UNSUPPORTED, "Specifying workers for nested hierarchical blocks"); + } else { + if (v3Global.opt.threads() < blockThreads) { + m_modp->v3error("Hierarchical blocks cannot be scheduled on more threads than in " + "thread pool, threads = " + << v3Global.opt.threads() + << " hierarchical block threads = " << blockThreads); + } + + opts.push_back(" --threads " + std::to_string(blockThreads)); + } + } + return opts; } diff --git a/src/V3HierBlock.h b/src/V3HierBlock.h index ff6e52fe3..b06b9639c 100644 --- a/src/V3HierBlock.h +++ b/src/V3HierBlock.h @@ -92,6 +92,7 @@ public: ~V3HierBlock() VL_MT_DISABLED; void addParent(V3HierBlock* parentp) { 
m_parents.insert(parentp); } + bool hasParent() const { return !m_parents.empty(); } void addChild(V3HierBlock* childp) { m_children.insert(childp); } bool hasChild() const { return !m_children.empty(); } const HierBlockSet& parents() const { return m_parents; } diff --git a/src/V3OrderParallel.cpp b/src/V3OrderParallel.cpp index 72473d71e..aca1ac329 100644 --- a/src/V3OrderParallel.cpp +++ b/src/V3OrderParallel.cpp @@ -1737,6 +1737,34 @@ private: VL_UNCOPYABLE(DpiImportCallVisitor); }; +//###################################################################### +// DpiThreadsVisitor + +// Get number of threads occupied by this mtask +class DpiThreadsVisitor final : public VNVisitorConst { + int m_threads = 1; // Max number of threads used by this mtask + + // METHODS + void visit(AstCFunc* nodep) override { + m_threads = std::max(m_threads, V3Config::getHierWorkers(nodep->cname())); + iterateChildrenConst(nodep); + } + void visit(AstNodeCCall* nodep) override { + iterateChildrenConst(nodep); + iterateConst(nodep->funcp()); + } + void visit(AstNode* nodep) override { iterateChildrenConst(nodep); } + +public: + // CONSTRUCTORS + explicit DpiThreadsVisitor(AstMTaskBody* nodep) { iterateConst(nodep); } + int threads() const { return m_threads; } + ~DpiThreadsVisitor() override = default; + +private: + VL_UNCOPYABLE(DpiThreadsVisitor); +}; + //###################################################################### // FixDataHazards @@ -2451,6 +2479,8 @@ AstExecGraph* V3Order::createParallel(OrderGraph& orderGraph, const std::string& // Create the ExecMTask ExecMTask* const execMTaskp = new ExecMTask{depGraphp, bodyp}; + if (!v3Global.opt.hierBlocks().empty()) + execMTaskp->threads(DpiThreadsVisitor{bodyp}.threads()); const bool newEntry = logicMTaskToExecMTask.emplace(mTaskp, execMTaskp).second; UASSERT_OBJ(newEntry, mTaskp, "LogicMTasks should be processed in dependencyorder"); UINFO(3, "Final '" << tag << "' LogicMTask " << mTaskp->id() << " maps to ExecMTask" diff --git a/src/V3ProtectLib.cpp b/src/V3ProtectLib.cpp index 7ca088113..9b09a7a5f 100644 --- a/src/V3ProtectLib.cpp +++ b/src/V3ProtectLib.cpp @@ -18,6 +18,7 @@ #include "V3ProtectLib.h" +#include "V3Config.h" #include "V3Hasher.h" #include "V3InstrCount.h" #include "V3String.h" @@ -119,8 +120,17 @@ class ProtectVisitor final : public VNVisitor { // Mark remaining NDA protectlib wrapper DPIs as non-hazardous by deliberately forwarding // them with non-zero cost. + // Also, specify hierarchical workers for those tasks for scheduling. 
txtp->addText(fl, "profile_data -hier-dpi \"" + m_libName + "_protectlib_combo_ignore\" -cost 64'd1\n"); + + txtp->addText(fl, "hier_workers -hier-dpi \"" + m_libName + + "_protectlib_combo_update\" -workers 16'd" + + std::to_string(V3Config::getHierWorkers(m_libName)) + "\n"); + txtp->addText(fl, "hier_workers -hier-dpi \"" + m_libName + + "_protectlib_seq_update\" -workers 16'd" + + std::to_string(V3Config::getHierWorkers(m_libName)) + "\n"); + // No workers for combo_ignore txtp->addText(fl, "`verilog\n"); txtp->addText(fl, "`endif\n"); } diff --git a/src/verilog.l b/src/verilog.l index d96e9f808..11b606307 100644 --- a/src/verilog.l +++ b/src/verilog.l @@ -115,6 +115,7 @@ vnum {vnum1}|{vnum2}|{vnum3}|{vnum4}|{vnum5} "full_case" { FL; return yVLT_FULL_CASE; } "hier_block" { FL; return yVLT_HIER_BLOCK; } "hier_params" { FL; return yVLT_HIER_PARAMS; } + "hier_workers" { FL; return yVLT_HIER_WORKERS; } "inline" { FL; return yVLT_INLINE; } "isolate_assignments" { FL; return yVLT_ISOLATE_ASSIGNMENTS; } "lint_off" { FL; return yVLT_LINT_OFF; } @@ -152,6 +153,7 @@ vnum {vnum1}|{vnum2}|{vnum3}|{vnum4}|{vnum5} -?"-scope" { FL; return yVLT_D_SCOPE; } -?"-task" { FL; return yVLT_D_TASK; } -?"-var" { FL; return yVLT_D_VAR; } + -?"-workers" { FL; return yVLT_D_WORKERS; } /* Reachable by attr_event_control */ "edge" { FL; return yEDGE; } diff --git a/src/verilog.y b/src/verilog.y index a6557b43e..543492cbe 100644 --- a/src/verilog.y +++ b/src/verilog.y @@ -466,6 +466,7 @@ BISONPRE_VERSION(3.7,%define api.header.include {"V3ParseBison.h"}) %token yVLT_FULL_CASE "full_case" %token yVLT_HIER_BLOCK "hier_block" %token yVLT_HIER_PARAMS "hier_params" +%token yVLT_HIER_WORKERS "hier_workers" %token yVLT_INLINE "inline" %token yVLT_ISOLATE_ASSIGNMENTS "isolate_assignments" %token yVLT_LINT_OFF "lint_off" @@ -503,6 +504,7 @@ BISONPRE_VERSION(3.7,%define api.header.include {"V3ParseBison.h"}) %token yVLT_D_SCOPE "--scope" %token yVLT_D_TASK "--task" %token yVLT_D_VAR "--var" +%token yVLT_D_WORKERS "--workers" %token yaD_PLI "${pli-system}" @@ -7660,6 +7662,10 @@ vltItem: { V3Config::addModulePragma(*$2, VPragmaType::HIER_BLOCK); } | yVLT_HIER_PARAMS vltDModuleE { V3Config::addModulePragma(*$2, VPragmaType::HIER_PARAMS); } + | yVLT_HIER_WORKERS vltDModuleE vltDWorkers + { V3Config::addHierWorkers($1, *$2, $3->toSInt()); } + | yVLT_HIER_WORKERS vltDHierDpi vltDWorkers + { V3Config::addHierWorkers($1, *$2, $3->toSInt()); } | yVLT_PARALLEL_CASE vltDFile { V3Config::addCaseParallel(*$2, 0); } | yVLT_PARALLEL_CASE vltDFile yVLT_D_LINES yaINTNUM @@ -7749,6 +7755,10 @@ vltDFTaskE: | yVLT_D_TASK str { $$ = $2; } ; +vltDWorkers: // --workers + yVLT_D_WORKERS yaINTNUM { $$ = $2; } + ; + vltInlineFront: yVLT_INLINE { $$ = true; } | yVLT_NO_INLINE { $$ = false; } diff --git a/test_regress/t/t_dotfiles.py b/test_regress/t/t_dotfiles.py index 3930a139e..af9106160 100755 --- a/test_regress/t/t_dotfiles.py +++ b/test_regress/t/t_dotfiles.py @@ -19,7 +19,7 @@ test.compile(v_flags2=["--dumpi-graph 6"], threads=2) for dotname in [ "linkcells", "task_call", "gate_graph", "gate_final", "acyc_simp", "orderg_pre", "orderg_acyc", "orderg_order", "orderg_domain", "ordermv_initial", "ordermv_hazards", - "ordermv_contraction", "ordermv_transitive1", "orderg_done", "schedule" + "ordermv_contraction", "ordermv_transitive1", "orderg_done", "pack", "schedule" ]: # Some files with identical prefix are generated multiple times during # Verilation. Ensure that at least one of each dotname-prefixed file is generated. 
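Note on the runtime side of the changes above: the include/verilated_threads.h hunk earlier in this patch gives the thread pool a pool of unassigned worker indexes, so that exec graphs of hierarchical blocks reserve workers dynamically instead of using fixed indexes that could collide ("shadow") across nested models. The following is a minimal, self-contained sketch of that mechanism, for illustration only; it uses std::mutex/std::lock_guard and plain assert in place of VerilatedMutex, VerilatedLockGuard, and VL_GUARDED_BY, and WorkerIndexPool is a hypothetical name, not a class from the patch.

    #include <cassert>
    #include <cstddef>
    #include <mutex>
    #include <stack>
    #include <vector>

    class WorkerIndexPool {
        std::mutex m_mutex;               // guards m_unassigned
        std::stack<size_t> m_unassigned;  // indexes of currently free workers
    public:
        explicit WorkerIndexPool(size_t nThreads) {
            for (size_t i = 0; i < nThreads; ++i) m_unassigned.push(i);
        }
        size_t assignWorkerIndex() {
            const std::lock_guard<std::mutex> lock{m_mutex};
            assert(!m_unassigned.empty());
            const size_t index = m_unassigned.top();
            m_unassigned.pop();
            return index;
        }
        void freeWorkerIndexes(std::vector<size_t>& indexes) {
            const std::lock_guard<std::mutex> lock{m_mutex};
            for (const size_t index : indexes) m_unassigned.push(index);
            indexes.clear();
        }
    };

    int main() {
        WorkerIndexPool pool{4};
        std::vector<size_t> indexes;
        // Reserve a worker for each of the first N-1 thread functions of an exec graph
        for (int i = 0; i < 3; ++i) indexes.push_back(pool.assignWorkerIndex());
        // ... launch the thread functions on the reserved workers ...
        pool.freeWorkerIndexes(indexes);  // release them at the end of the section
        return 0;
    }

The generated code in addThreadStartToExecGraph follows the same shape: reserve indexes, addTask on workerp(indexes[i]), then freeWorkerIndexes once the section's final mtask state has been signalled.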
diff --git a/test_regress/t/t_hier_block_perf.py b/test_regress/t/t_hier_block_perf.py index f73b5db4c..09b0fde37 100755 --- a/test_regress/t/t_hier_block_perf.py +++ b/test_regress/t/t_hier_block_perf.py @@ -13,17 +13,30 @@ test.scenarios('vlt_all') test.init_benchmarksim() test.cycles = (int(test.benchmark) if test.benchmark else 1000000) test.sim_time = test.cycles * 10 + 1000 -THREADS = int(os.environ["SIM_THREADS"]) if "SIM_THREADS" in os.environ else 2 -test.compile(benchmarksim=1, - v_flags2=[ - "+define+SIM_CYCLES=" + str(test.cycles), "--prof-exec", "--hierarchical", - "--stats" - ], - threads=(THREADS if test.vltmt else 1)) +THREADS = int(os.environ["THREADS"]) if "THREADS" in os.environ else 4 +HIER_BLOCK_THREADS = int( + os.environ["HIER_BLOCK_THREADS"]) if "HIER_BLOCK_THREADS" in os.environ else 2 + +config_file = test.t_dir + "/" + test.name + ".vlt" + +test.compile( + benchmarksim=1, + v_flags2=[ + config_file, "+define+SIM_CYCLES=" + str(test.cycles), "--prof-exec", "--hierarchical", + "--stats", "-Wno-UNOPTFLAT", + (f"-DWORKERS={HIER_BLOCK_THREADS}" if test.vltmt and HIER_BLOCK_THREADS > 1 else "") + ], + threads=(THREADS if test.vltmt else 1)) test.file_grep(test.obj_dir + "/V" + test.name + "__hier.dir/V" + test.name + "__stats.txt", - r'Optimizations, Hierarchical DPI wrappers with costs\s+(\d+)', 3) + r'Optimizations, Hierarchical DPI wrappers with costs\s+(\d+)', 6) + +if test.vltmt: + test.file_grep(test.obj_dir + "/V" + test.name + "__hier.dir/V" + test.name + "__stats.txt", + r'Optimizations, Thread schedule count\s+(\d+)', 4) + test.file_grep(test.obj_dir + "/V" + test.name + "__hier.dir/V" + test.name + "__stats.txt", + r'Optimizations, Thread schedule total tasks\s+(\d+)', 10) test.execute(all_run_flags=[ "+verilator+prof+exec+start+2", diff --git a/test_regress/t/t_hier_block_perf.v b/test_regress/t/t_hier_block_perf.v index 59470bb03..3c95e3343 100644 --- a/test_regress/t/t_hier_block_perf.v +++ b/test_regress/t/t_hier_block_perf.v @@ -6,12 +6,8 @@ // based on t_gate_ormux -`ifndef HIER_CORES - `define HIER_CORES 3 -`endif - -`ifndef MAIN_CORES - `define MAIN_CORES 1 +`ifndef CORES + `define CORES 4 `endif module t (/*AUTOARG*/ @@ -21,37 +17,11 @@ module t (/*AUTOARG*/ input clk; generate - for (genvar i = 0; i < `MAIN_CORES; ++i) NonHierCore mainCore(clk); - endgenerate - - generate - for (genvar i = 0; i < `HIER_CORES; ++i) Core hierCore(clk); + for (genvar i = 0; i < `CORES; ++i) Core core(clk); endgenerate endmodule -module Core(input clk); /* verilator hier_block */ - reg [63:0] crc; - logic [31:0] rdata; - logic [31:0] rdata2; - wire [31:0] wdata = crc[31:0]; - wire [15:0] sel = {11'h0, crc[36:32]}; - wire we = crc[48]; - - Test test ( - // Outputs - .rdata (rdata[31:0]), - .rdata2 (rdata2[31:0]), - // Inputs - .clk (clk), - .we (we), - .sel (sel[15:0]), - .wdata (wdata[31:0])); - wire [63:0] result = {rdata2, rdata}; - - Check check(.clk(clk), .crc(crc), .result(result), .rdata(rdata), .rdata2(rdata2)); -endmodule - -module NonHierCore(input clk); +module Core(input clk); reg [63:0] crc; logic [31:0] rdata; logic [31:0] rdata2; @@ -79,7 +49,7 @@ module Check( input wire [63:0] result, input logic [31:0] rdata, input logic [31:0] rdata2 - ); + ); /*verilator hier_block*/ integer cyc = 0; reg [63:0] sum; @@ -118,7 +88,7 @@ module Test(/*AUTOARG*/ rdata, rdata2, // Inputs clk, we, sel, wdata - ); + ); /*verilator hier_block*/ input clk; input we; input [15:0] sel; diff --git a/test_regress/t/t_hier_block_perf.vlt b/test_regress/t/t_hier_block_perf.vlt new 
file mode 100644 index 000000000..1f7e0240a --- /dev/null +++ b/test_regress/t/t_hier_block_perf.vlt @@ -0,0 +1,11 @@ +// DESCRIPTION: Verilator: Verilog Test module +// +// This file ONLY is placed into the Public Domain, for any use, +// without warranty, 2025 by Antmicro. +// SPDX-License-Identifier: CC0-1.0 + +`verilator_config +`ifdef WORKERS +hier_workers -module "Test" -workers `WORKERS +hier_workers -module "Check" -workers `WORKERS +`endif diff --git a/test_regress/t/t_hier_block_threads_bad.out b/test_regress/t/t_hier_block_threads_bad.out new file mode 100644 index 000000000..591c7055a --- /dev/null +++ b/test_regress/t/t_hier_block_threads_bad.out @@ -0,0 +1,9 @@ +%Error: t/t_hier_block_threads_bad.v:23:8: Hierarchical blocks cannot be scheduled on more threads than in thread pool, threads = 4 hierarchical block threads = 8 + : ... note: In instance 't.genblk1[1].hierCore' + 23 | module Core(input clk); /*verilator hier_block*/ + | ^~~~ +%Error-UNSUPPORTED: t/t_hier_block_threads_bad.vlt:8:1: Specifying workers for nested hierarchical blocks + 8 | hier_workers -module "Core" -workers 8 + | ^~~~~~~~~~~~ + ... For error description see https://verilator.org/warn/UNSUPPORTED?v=latest +%Error: Exiting due to diff --git a/test_regress/t/t_hier_block_threads_bad.py b/test_regress/t/t_hier_block_threads_bad.py new file mode 100755 index 000000000..1d1890dcd --- /dev/null +++ b/test_regress/t/t_hier_block_threads_bad.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +# DESCRIPTION: Verilator: Verilog Test driver/expect definition +# +# Copyright 2025 by Wilson Snyder. This program is free software; you +# can redistribute it and/or modify it under the terms of either the GNU +# Lesser General Public License Version 3 or the Perl Artistic License +# Version 2.0. +# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 + +import vltest_bootstrap + +test.scenarios('vltmt') + +test.lint(fails=True, + verilator_flags2=['t/t_hier_block_threads_bad.vlt', '-DWORKERS=8', '--hierarchical'], + expect_filename=test.golden_filename, + threads=4) + +test.passes() diff --git a/test_regress/t/t_hier_block_threads_bad.v b/test_regress/t/t_hier_block_threads_bad.v new file mode 100644 index 000000000..04c72b276 --- /dev/null +++ b/test_regress/t/t_hier_block_threads_bad.v @@ -0,0 +1,32 @@ +// DESCRIPTION: Verilator: Verilog Test module +// +// This file ONLY is placed under the Creative Commons Public Domain, for +// any use, without warranty, 2025 by Wilson Snyder. +// SPDX-License-Identifier: CC0-1.0 + +module t (/*AUTOARG*/ + // Inputs + clk + ); + input clk; + + generate + for (genvar i = 0; i < 2; ++i) Core hierCore(clk); + endgenerate + + always @(negedge clk) begin + $write("*-* All Finished *-*\n"); + $finish; + end +endmodule + +module Core(input clk); /* verilator hier_block */ + generate + for (genvar i = 0; i < 2; ++i) SubCore sub(clk); + endgenerate + always @(posedge clk) $display("%m"); +endmodule + +module SubCore(input clk); /* verilator hier_block */ + always @(posedge clk) $display("%m"); +endmodule diff --git a/test_regress/t/t_hier_block_threads_bad.vlt b/test_regress/t/t_hier_block_threads_bad.vlt new file mode 100644 index 000000000..d6cf050b2 --- /dev/null +++ b/test_regress/t/t_hier_block_threads_bad.vlt @@ -0,0 +1,9 @@ +// DESCRIPTION: Verilator: Verilog Test module +// +// This file ONLY is placed into the Public Domain, for any use, +// without warranty, 2025 by Antmicro. 
+// SPDX-License-Identifier: CC0-1.0 + +`verilator_config +hier_workers -module "Core" -workers `WORKERS +hier_workers -module "SubCore" -workers `WORKERS
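For reference, the busyUntil bookkeeping that the V3ExecGraph.cpp changes perform for wide tasks can be illustrated with the numbers used in the selftest: with a 6-thread pool and wide tasks needing 3 workers each, the wide-task schedule is 6 / 3 = 2 slots wide, and a task placed on slot s occupies pool threads s + i * 2 for i in 0..2 (the bestThreadId + i * schedule.threads.size() formula). A small standalone sketch with illustrative values, not code from the patch:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
        const uint32_t poolThreads = 6;  // --threads, as in the selftest
        const uint32_t workers = 3;      // threads() of the wide tasks (maxThreadWorkers)
        const uint32_t slots = poolThreads / workers;  // width of the wide-task schedule: 2
        std::vector<uint32_t> busyUntil(poolThreads, 0);
        const uint32_t slot = 1;         // schedule slot chosen for the wide task
        const uint32_t endTime = 1229;   // its predicted completion time (illustrative)
        // A wide task on `slot` occupies pool threads slot, slot + slots, slot + 2*slots, ...
        for (uint32_t i = 0; i < workers; ++i) busyUntil[slot + i * slots] = endTime;
        for (uint32_t t = 0; t < poolThreads; ++t)
            std::cout << "pool thread " << t << " busy until " << busyUntil[t] << "\n";
        return 0;
    }

With slot 1 this marks pool threads 1, 3, and 5 busy, which is why the selftest expects the wide-task schedule to be only two slots wide while still accounting for all six pool threads.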