diff --git a/include/verilated_profiler.h b/include/verilated_profiler.h index abc5ba600..362562d0d 100644 --- a/include/verilated_profiler.h +++ b/include/verilated_profiler.h @@ -220,12 +220,25 @@ class VlPgoProfiler final { // Counters are stored packed, all together to reduce cache effects std::array m_counters{}; // Time spent on this record std::vector m_records; // Record information + const uint64_t + m_currentHierBlockCost; // The original cost of a profiled hier block. During Verilation + // with collected profiling data, costs of hier blocks change, + // thus hashes of the original mtasks do not match those from the + // previous, instrumented, run. We shall not assume that a single + // top-level mtask will correspond to the hier block, as multiple + // hier block DPIs can be contracted into a single mtask. + // Therefore, the old cost, from the previous instrumented run, is + // used to stabilize profiled scheduling. public: // METHODS - VlPgoProfiler() = default; + explicit VlPgoProfiler(uint64_t currentHierBlockCost = 0) + : m_currentHierBlockCost{currentHierBlockCost} {} ~VlPgoProfiler() = default; - void write(const char* modelp, const std::string& filename, bool firstHierCall) VL_MT_SAFE; + VL_UNMOVABLE(VlPgoProfiler); + VL_UNCOPYABLE(VlPgoProfiler); + void writeHeader(const std::string& filename) VL_MT_SAFE; + void write(const char* modelp, const std::string& filename) VL_MT_SAFE; void addCounter(size_t counter, const std::string& name) { VL_DEBUG_IF(assert(counter < N_Entries);); m_records.emplace_back(Record{name, counter}); @@ -239,8 +252,7 @@ public: }; template -void VlPgoProfiler::write(const char* modelp, const std::string& filename, - bool firstHierCall) VL_MT_SAFE { +void VlPgoProfiler::writeHeader(const std::string& filename) VL_MT_SAFE { static VerilatedMutex s_mutex; const VerilatedLockGuard lock{s_mutex}; @@ -250,22 +262,38 @@ void VlPgoProfiler::write(const char* modelp, const std::string& file // each will collect is own data correctly. 
However when each is // destroyed we need to get all the data, not keep overwriting and only // get the last model's data. - static bool s_firstCall = firstHierCall; - VL_DEBUG_IF(VL_DBG_MSGF("+prof+vlt+file writing to '%s'\n", filename.c_str());); - - FILE* const fp = std::fopen(filename.c_str(), s_firstCall ? "w" : "a"); + FILE* const fp = std::fopen(filename.c_str(), "w"); if (VL_UNLIKELY(!fp)) { VL_FATAL_MT(filename.c_str(), 0, "", "+prof+vlt+file file not writable"); } - if (s_firstCall) { - // TODO Perhaps merge with verilated_coverage output format, so can - // have a common merging and reporting tool, etc. - fprintf(fp, "// Verilated model profile-guided optimization data dump file\n"); - fprintf(fp, "`verilator_config\n"); + + VL_DEBUG_IF(VL_DBG_MSGF("+prof+vlt+file initializing '%s'\n", filename.c_str());); + + // TODO Perhaps merge with verilated_coverage output format, so can + // have a common merging and reporting tool, etc. + fprintf(fp, "// Verilated model profile-guided optimization data dump file\n"); + fprintf(fp, "`verilator_config\n"); + + std::fclose(fp); +} + +template +void VlPgoProfiler::write(const char* modelp, const std::string& filename) VL_MT_SAFE { + static VerilatedMutex s_mutex; + const VerilatedLockGuard lock{s_mutex}; + + FILE* const fp = std::fopen(filename.c_str(), "a"); + if (VL_UNLIKELY(!fp)) { + VL_FATAL_MT(filename.c_str(), 0, "", "+prof+vlt+file file not writable"); } - s_firstCall = false; + VL_DEBUG_IF(VL_DBG_MSGF("+prof+vlt+file writing to '%s'\n", filename.c_str());); + + if (m_currentHierBlockCost) { + fprintf(fp, "profile_data -hier-dpi \"%s\" -cost 64'd%" PRIu64 "\n", modelp, + m_currentHierBlockCost); + } for (const Record& rec : m_records) { fprintf(fp, "profile_data -model \"%s\" -mtask \"%s\" -cost 64'd%" PRIu64 "\n", modelp, diff --git a/src/V3Control.cpp b/src/V3Control.cpp index cef5ca180..23b7ec30b 100644 --- a/src/V3Control.cpp +++ b/src/V3Control.cpp @@ -18,6 +18,7 @@ #include "V3Control.h" +#include
"V3InstrCount.h" #include "V3String.h" #include @@ -574,7 +575,11 @@ public: ProfileDataMode mode = MTASK) { if (!m_profileFileLine) m_profileFileLine = fl; if (cost == 0) cost = 1; // Cost 0 means delete (or no data) - m_profileData[model][key] += cost; + if (mode == MTASK) { + m_profileData[model][key] += cost; + } else if (mode == HIER_DPI) { + m_profileData[model][key] = std::max(m_profileData[model][key], cost); + } m_mode |= mode; } bool containsMTaskProfileData() const { return m_mode & MTASK; } @@ -603,6 +608,16 @@ public: return it->second; } FileLine* getProfileDataFileLine() const { return m_profileFileLine; } // Maybe null + static uint64_t getCurrentHierBlockCost() { + if (uint64_t cost = V3Control::getProfileData(v3Global.opt.prefix())) { + UINFO(9, "Fetching cost from profile info: " << cost); + return cost; + } else { + cost = V3InstrCount::count(v3Global.rootp()->evalp(), false); + UINFO(9, "Evaluating cost: " << cost); + return cost; + } + } }; //###################################################################### @@ -797,6 +812,9 @@ void V3Control::contentsPushText(const string& text) { return WildcardContents:: bool V3Control::containsMTaskProfileData() { return V3ControlResolver::s().containsMTaskProfileData(); } +uint64_t V3Control::getCurrentHierBlockCost() { + return V3ControlResolver::s().getCurrentHierBlockCost(); +} bool V3Control::waive(const FileLine* filelinep, V3ErrorCode code, const string& message) { V3ControlFile* const filep = V3ControlResolver::s().files().resolve(filelinep->filename()); diff --git a/src/V3Control.h b/src/V3Control.h index 3b3142158..fcc807620 100644 --- a/src/V3Control.h +++ b/src/V3Control.h @@ -64,6 +64,7 @@ public: static void contentsPushText(const string& text); static bool containsMTaskProfileData(); + static uint64_t getCurrentHierBlockCost(); static bool waive(const FileLine* filelinep, V3ErrorCode code, const string& message); }; diff --git a/src/V3EmitCSyms.cpp b/src/V3EmitCSyms.cpp index 
f26d98d3b..c72db2635 100644 --- a/src/V3EmitCSyms.cpp +++ b/src/V3EmitCSyms.cpp @@ -491,6 +491,11 @@ void EmitCSyms::emitSymHdr() { puts("VlExecutionProfiler* const __Vm_executionProfilerp;\n"); } + if (v3Global.opt.profPgo()) { + puts("\n// PGO PROFILING\n"); + puts("VlPgoProfiler<" + std::to_string(ExecMTask::numUsedIds()) + "> _vm_pgoProfiler;\n"); + } + puts("\n// MODULE INSTANCE STATE\n"); for (const auto& i : m_scopes) { const AstScope* const scopep = i.first; @@ -509,11 +514,6 @@ void EmitCSyms::emitSymHdr() { puts("];\n"); } - if (v3Global.opt.profPgo()) { - puts("\n// PGO PROFILING\n"); - puts("VlPgoProfiler<" + std::to_string(ExecMTask::numUsedIds()) + "> _vm_pgoProfiler;\n"); - } - if (!m_scopeNames.empty()) { // Scope names puts("\n// SCOPE NAMES\n"); for (const auto& itr : m_scopeNames) { @@ -728,11 +728,8 @@ void EmitCSyms::emitSymImp() { puts("#endif // VM_TRACE\n"); } if (v3Global.opt.profPgo()) { - // Do not overwrite data during the last hierarchical stage. - const string firstHierCall - = (v3Global.opt.hierBlocks().empty() || v3Global.opt.hierChild()) ? 
"true" : "false"; puts("_vm_pgoProfiler.write(\"" + EmitCUtil::topClassName() - + "\", _vm_contextp__->profVltFilename(), " + firstHierCall + ");\n"); + + "\", _vm_contextp__->profVltFilename());\n"); } puts("}\n"); @@ -802,6 +799,9 @@ void EmitCSyms::emitSymImp() { "__Vm_executionProfilerp{static_cast(contextp->" "enableExecutionProfiler(&VlExecutionProfiler::construct))}\n"); } + if (v3Global.opt.profPgo() && !v3Global.opt.libCreate().empty()) { + puts(" , _vm_pgoProfiler{" + std::to_string(v3Global.currentHierBlockCost()) + "}\n"); + } puts(" // Setup module instances\n"); for (const auto& i : m_scopes) { @@ -835,6 +835,9 @@ void EmitCSyms::emitSymImp() { if (v3Global.opt.profPgo()) { puts("// Configure profiling for PGO\n"); + if (!v3Global.opt.hierChild()) { + puts("_vm_pgoProfiler.writeHeader(_vm_contextp__->profVltFilename());\n"); + } if (v3Global.opt.mtasks()) { v3Global.rootp()->topModulep()->foreach([&](const AstExecGraph* execGraphp) { for (const V3GraphVertex& vtx : execGraphp->depGraphp()->vertices()) { diff --git a/src/V3Global.h b/src/V3Global.h index 3c4e502ff..a9706c46a 100644 --- a/src/V3Global.h +++ b/src/V3Global.h @@ -127,6 +127,7 @@ class V3Global final { bool m_hasSCTextSections = false; // Has `systemc_* sections that need to be emitted bool m_useParallelBuild = false; // Use parallel build for model bool m_useRandomizeMethods = false; // Need to define randomize() class methods + uint64_t m_currentHierBlockCost = 0; // Total cost of this hier block, used for scheduling // Memory address to short string mapping (for debug) std::unordered_map @@ -214,6 +215,8 @@ public: const std::string& ptrToId(const void* p); std::thread::id mainThreadId() const { return m_mainThreadId; } static std::vector verilatedCppFiles(); + uint64_t currentHierBlockCost() const { return m_currentHierBlockCost; } + void currentHierBlockCost(uint64_t cost) { m_currentHierBlockCost = cost; } }; extern V3Global v3Global; diff --git a/src/V3ProtectLib.cpp 
b/src/V3ProtectLib.cpp index b996b6c6d..0f680edfd 100644 --- a/src/V3ProtectLib.cpp +++ b/src/V3ProtectLib.cpp @@ -24,8 +24,6 @@ #include "V3String.h" #include "V3Task.h" -#include - VL_DEFINE_DEBUG_FUNCTIONS; //###################################################################### @@ -104,21 +102,14 @@ class ProtectVisitor final : public VNVisitor { txtp->addText(fl, "\n`ifdef VERILATOR\n"); txtp->addText(fl, "`verilator_config\n"); - // The `eval` function is called inside both update functions. As those functions - // are created by text bashing, we need to find cost of `_eval` which is the first function - // with a real cost in AST. - uint32_t cost = 0; - modp->foreach([&cost](AstCFunc* cfuncp) { - if (cfuncp->name() == "_eval") cost = V3InstrCount::count(cfuncp, false); - }); txtp->addText(fl, "profile_data -hier-dpi \"" + m_libName - + "_protectlib_combo_update\" -cost 64'd" + std::to_string(cost) - + "\n"); + + "_protectlib_combo_update\" -cost 64'd" + + std::to_string(v3Global.currentHierBlockCost()) + "\n"); txtp->addText(fl, "profile_data -hier-dpi \"" + m_libName - + "_protectlib_seq_update\" -cost 64'd" + std::to_string(cost) - + "\n"); + + "_protectlib_seq_update\" -cost 64'd" + + std::to_string(v3Global.currentHierBlockCost()) + "\n"); - // Mark remaining NDA protectlib wrapper DPIs as non-hazardous by deliberately forwarding + // Mark remaining NBA protectlib wrapper DPIs as non-hazardous by deliberately forwarding // them with non-zero cost. // Also, specify hierarchical workers for those tasks for scheduling. 
txtp->addText(fl, "profile_data -hier-dpi \"" + m_libName diff --git a/src/Verilator.cpp b/src/Verilator.cpp index 7479b269e..99070ffe4 100644 --- a/src/Verilator.cpp +++ b/src/Verilator.cpp @@ -34,6 +34,7 @@ #include "V3Combine.h" #include "V3Common.h" #include "V3Const.h" +#include "V3Control.h" #include "V3Coverage.h" #include "V3CoverageJoin.h" #include "V3Dead.h" @@ -590,6 +591,11 @@ static void process() { // Create AstCUse to determine what class forward declarations/#includes needed in C V3CUse::cUseAll(); + + // Evaluate cost of a current hierarchical block + if (!v3Global.opt.libCreate().empty()) { + v3Global.currentHierBlockCost(V3Control::getCurrentHierBlockCost()); + } } // Output the text diff --git a/test_regress/t/t_hier_block_perf.v b/test_regress/t/t_hier_block_perf.v index 3c95e3343..03f7e0bfd 100644 --- a/test_regress/t/t_hier_block_perf.v +++ b/test_regress/t/t_hier_block_perf.v @@ -18,6 +18,7 @@ module t (/*AUTOARG*/ generate for (genvar i = 0; i < `CORES; ++i) Core core(clk); + for (genvar i = 0; i < `CORES; ++i) CoreHier hierCore(clk); endgenerate endmodule @@ -43,6 +44,15 @@ module Core(input clk); Check check(.clk(clk), .crc(crc), .result(result), .rdata(rdata), .rdata2(rdata2)); endmodule +module CoreHier(input clk); + // Dummy logic to have two different hier blocks at the same level. 
+ integer cyc = 0; + always @(posedge clk) begin + cyc += 1; + if (cyc == 1) $display("%d", clk); + end +endmodule + module Check( input clk, output reg [63:0] crc, diff --git a/test_regress/t/t_pgo_threads_hier.py b/test_regress/t/t_pgo_threads_hier.py index 438fd97a6..0c0449825 100755 --- a/test_regress/t/t_pgo_threads_hier.py +++ b/test_regress/t/t_pgo_threads_hier.py @@ -11,11 +11,12 @@ import vltest_bootstrap test.scenarios('vltmt') test.top_filename = "t/t_hier_block_perf.v" -cycles = 100000 +cycles = 100 test.sim_time = cycles * 10 + 1000 threads = 2 -flags = ["--hierarchical", "-Wno-UNOPTFLAT", "-DSIM_CYCLES=" + str(cycles)] +config_file = test.t_dir + "/" + test.name + ".vlt" +flags = [config_file, "--hierarchical", "-Wno-UNOPTFLAT", "-DSIM_CYCLES=" + str(cycles)] test.compile(benchmarksim=1, v_flags2=["--prof-pgo"] + flags, threads=threads) @@ -24,15 +25,21 @@ test.execute(all_run_flags=[ " +verilator+prof+exec+file+/dev/null", " +verilator+prof+vlt+file+" + test.obj_dir + "/profile.vlt"]) # yapf:disable +test.file_grep(test.obj_dir + "/profile.vlt", r'profile_data -model "VTest"') +test.file_grep(test.obj_dir + "/profile.vlt", r'profile_data -model "VCheck"') +test.file_grep(test.obj_dir + "/profile.vlt", r'profile_data -model "VCoreHier"') test.file_grep(test.obj_dir + "/profile.vlt", r'profile_data -model "V' + test.name + '"') +# Check for cost rollovers +test.file_grep_not(test.obj_dir + "/profile.vlt", r'.*cost 64\'d\d{18}.*') + # Differentiate benchmarksim results test.name = test.name + "_optimized" test.compile( benchmarksim=1, # Intentionally no --prof-pgo here to make sure profile data can be read in # without it (that is: --prof-pgo has no effect on profile_data hash names) - v_flags2=flags, + v_flags2=[test.obj_dir + "/profile.vlt"] + flags, threads=threads) test.execute() diff --git a/test_regress/t/t_pgo_threads_hier_nested.vlt b/test_regress/t/t_pgo_threads_hier.vlt similarity index 81% rename from 
test_regress/t/t_pgo_threads_hier_nested.vlt rename to test_regress/t/t_pgo_threads_hier.vlt index 3dedcb17f..caa255202 100644 --- a/test_regress/t/t_pgo_threads_hier_nested.vlt +++ b/test_regress/t/t_pgo_threads_hier.vlt @@ -8,3 +8,5 @@ hier_workers -module "Test" -workers 2 hier_block -module "Check" hier_workers -module "Check" -workers 2 +hier_block -module "CoreHier" +hier_workers -module "CoreHier" -workers 2 diff --git a/test_regress/t/t_pgo_threads_hier_nested.py b/test_regress/t/t_pgo_threads_hier_nested.py deleted file mode 100755 index 55eaa6461..000000000 --- a/test_regress/t/t_pgo_threads_hier_nested.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -# DESCRIPTION: Verilator: Verilog Test driver/expect definition -# -# Copyright 2025 by Wilson Snyder. This program is free software; you -# can redistribute it and/or modify it under the terms of either the GNU -# Lesser General Public License Version 3 or the Perl Artistic License -# Version 2.0. -# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 - -import vltest_bootstrap - -test.scenarios('vltmt') -test.top_filename = "t/t_hier_block_perf.v" -cycles = 100000 -test.sim_time = cycles * 10 + 1000 - -threads = 2 -config_file = test.t_dir + "/" + test.name + ".vlt" -flags = [config_file, "--hierarchical", "-Wno-UNOPTFLAT", "-DSIM_CYCLES=" + str(cycles)] - -test.compile(benchmarksim=1, v_flags2=["--prof-pgo"] + flags, threads=threads) - -test.execute(all_run_flags=[ - "+verilator+prof+exec+start+0", - " +verilator+prof+exec+file+/dev/null", - " +verilator+prof+vlt+file+" + test.obj_dir + "/profile.vlt"]) # yapf:disable - -test.file_grep(test.obj_dir + "/profile.vlt", r'profile_data -model "VTest"') -test.file_grep(test.obj_dir + "/profile.vlt", r'profile_data -model "V' + test.name + '"') - -# Check for cost rollovers -test.file_grep_not(test.obj_dir + "/profile.vlt", r'.*cost 64\'d\d{18}.*') - -# Differentiate benchmarksim results -test.name = test.name + "_optimized" -test.compile( - 
benchmarksim=1, - # Intentionally no --prof-pgo here to make sure profile data can be read in - # without it (that is: --prof-pgo has no effect on profile_data hash names) - v_flags2=flags, - threads=threads) - -test.execute() - -test.passes()