Fix hierarchical `--prof-pgo` (#6213)

This commit is contained in:
Bartłomiej Chmiel 2025-09-13 16:19:00 +02:00 committed by GitHub
parent 907047d823
commit be813e96dd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 110 additions and 86 deletions

View File

@ -220,12 +220,25 @@ class VlPgoProfiler final {
// Counters are stored packed, all together to reduce cache effects
std::array<uint64_t, N_Entries> m_counters{}; // Time spent on this record
std::vector<Record> m_records; // Record information
const uint64_t
m_currentHierBlockCost; // Original cost of the profiled hier block. When verilating
// with collected profiling data, the costs of hier blocks change,
// so the hashes of the original mtasks do not match those from the
// previous (instrumented) run. We must not assume that a single
// top-level mtask corresponds to the hier block, because multiple
// hier block DPIs can be contracted into a single mtask.
// Therefore the old cost, from the previous instrumented run, is
// used to stabilize profiled scheduling.
// Zero (the constructor default) suppresses the -hier-dpi record
// in write().
public:
// METHODS
VlPgoProfiler() = default;
explicit VlPgoProfiler(uint64_t currentHierBlockCost = 0)
: m_currentHierBlockCost{currentHierBlockCost} {}
~VlPgoProfiler() = default;
void write(const char* modelp, const std::string& filename, bool firstHierCall) VL_MT_SAFE;
VL_UNMOVABLE(VlPgoProfiler);
VL_UNCOPYABLE(VlPgoProfiler);
void writeHeader(const std::string& filename) VL_MT_SAFE;
void write(const char* modelp, const std::string& filename) VL_MT_SAFE;
void addCounter(size_t counter, const std::string& name) {
VL_DEBUG_IF(assert(counter < N_Entries););
m_records.emplace_back(Record{name, counter});
@ -239,8 +252,7 @@ public:
};
template <std::size_t N_Entries>
void VlPgoProfiler<N_Entries>::write(const char* modelp, const std::string& filename,
bool firstHierCall) VL_MT_SAFE {
void VlPgoProfiler<N_Entries>::writeHeader(const std::string& filename) VL_MT_SAFE {
static VerilatedMutex s_mutex;
const VerilatedLockGuard lock{s_mutex};
@ -250,22 +262,38 @@ void VlPgoProfiler<N_Entries>::write(const char* modelp, const std::string& file
// each will collect its own data correctly. However when each is
// destroyed we need to get all the data, not keep overwriting and only
// get the last model's data.
static bool s_firstCall = firstHierCall;
VL_DEBUG_IF(VL_DBG_MSGF("+prof+vlt+file writing to '%s'\n", filename.c_str()););
FILE* const fp = std::fopen(filename.c_str(), s_firstCall ? "w" : "a");
FILE* const fp = std::fopen(filename.c_str(), "w");
if (VL_UNLIKELY(!fp)) {
VL_FATAL_MT(filename.c_str(), 0, "", "+prof+vlt+file file not writable");
}
if (s_firstCall) {
// TODO Perhaps merge with verilated_coverage output format, so can
// have a common merging and reporting tool, etc.
fprintf(fp, "// Verilated model profile-guided optimization data dump file\n");
fprintf(fp, "`verilator_config\n");
VL_DEBUG_IF(VL_DBG_MSGF("+prof+vlt+file initializing '%s'\n", filename.c_str()););
// TODO Perhaps merge with verilated_coverage output format, so can
// have a common merging and reporting tool, etc.
fprintf(fp, "// Verilated model profile-guided optimization data dump file\n");
fprintf(fp, "`verilator_config\n");
std::fclose(fp);
}
template <std::size_t N_Entries>
void VlPgoProfiler<N_Entries>::write(const char* modelp, const std::string& filename) VL_MT_SAFE {
static VerilatedMutex s_mutex;
const VerilatedLockGuard lock{s_mutex};
FILE* const fp = std::fopen(filename.c_str(), "a");
if (VL_UNLIKELY(!fp)) {
VL_FATAL_MT(filename.c_str(), 0, "", "+prof+vlt+file file not writable");
}
s_firstCall = false;
VL_DEBUG_IF(VL_DBG_MSGF("+prof+vlt+file writing to '%s'\n", filename.c_str()););
// Emit this hier block's own cost, but only when a non-zero cost was supplied
if (m_currentHierBlockCost) {
    // Use PRIu64, as for the per-mtask records: uint64_t is not 'unsigned long'
    // on every platform, so "%lu" would be a format/argument mismatch there
    fprintf(fp, "profile_data -hier-dpi \"%s\" -cost 64'd%" PRIu64 "\n", modelp,
            m_currentHierBlockCost);
}
for (const Record& rec : m_records) {
fprintf(fp, "profile_data -model \"%s\" -mtask \"%s\" -cost 64'd%" PRIu64 "\n", modelp,

View File

@ -18,6 +18,7 @@
#include "V3Control.h"
#include "V3InstrCount.h"
#include "V3String.h"
#include <memory>
@ -574,7 +575,11 @@ public:
ProfileDataMode mode = MTASK) {
if (!m_profileFileLine) m_profileFileLine = fl;
if (cost == 0) cost = 1; // Cost 0 means delete (or no data)
m_profileData[model][key] += cost;
if (mode == MTASK) {
m_profileData[model][key] += cost;
} else if (mode == HIER_DPI) {
m_profileData[model][key] = std::max(m_profileData[model][key], cost);
}
m_mode |= mode;
}
bool containsMTaskProfileData() const { return m_mode & MTASK; }
@ -603,6 +608,16 @@ public:
return it->second;
}
FileLine* getProfileDataFileLine() const { return m_profileFileLine; } // Maybe null
static uint64_t getCurrentHierBlockCost() {
if (uint64_t cost = V3Control::getProfileData(v3Global.opt.prefix())) {
UINFO(9, "Fetching cost from profile info: " << cost);
return cost;
} else {
cost = V3InstrCount::count(v3Global.rootp()->evalp(), false);
UINFO(9, "Evaluating cost: " << cost);
return cost;
}
}
};
//######################################################################
@ -797,6 +812,9 @@ void V3Control::contentsPushText(const string& text) { return WildcardContents::
// Public wrapper: forwards the query to the V3ControlResolver instance from s()
bool V3Control::containsMTaskProfileData() {
    const bool hasMTaskData = V3ControlResolver::s().containsMTaskProfileData();
    return hasMTaskData;
}
// Public wrapper: forwards to V3ControlResolver's getCurrentHierBlockCost()
uint64_t V3Control::getCurrentHierBlockCost() {
    const uint64_t hierBlockCost = V3ControlResolver::s().getCurrentHierBlockCost();
    return hierBlockCost;
}
bool V3Control::waive(const FileLine* filelinep, V3ErrorCode code, const string& message) {
V3ControlFile* const filep = V3ControlResolver::s().files().resolve(filelinep->filename());

View File

@ -64,6 +64,7 @@ public:
static void contentsPushText(const string& text);
static bool containsMTaskProfileData();
static uint64_t getCurrentHierBlockCost();
static bool waive(const FileLine* filelinep, V3ErrorCode code, const string& message);
};

View File

@ -491,6 +491,11 @@ void EmitCSyms::emitSymHdr() {
puts("VlExecutionProfiler* const __Vm_executionProfilerp;\n");
}
if (v3Global.opt.profPgo()) {
puts("\n// PGO PROFILING\n");
puts("VlPgoProfiler<" + std::to_string(ExecMTask::numUsedIds()) + "> _vm_pgoProfiler;\n");
}
puts("\n// MODULE INSTANCE STATE\n");
for (const auto& i : m_scopes) {
const AstScope* const scopep = i.first;
@ -509,11 +514,6 @@ void EmitCSyms::emitSymHdr() {
puts("];\n");
}
if (v3Global.opt.profPgo()) {
puts("\n// PGO PROFILING\n");
puts("VlPgoProfiler<" + std::to_string(ExecMTask::numUsedIds()) + "> _vm_pgoProfiler;\n");
}
if (!m_scopeNames.empty()) { // Scope names
puts("\n// SCOPE NAMES\n");
for (const auto& itr : m_scopeNames) {
@ -728,11 +728,8 @@ void EmitCSyms::emitSymImp() {
puts("#endif // VM_TRACE\n");
}
if (v3Global.opt.profPgo()) {
// Do not overwrite data during the last hierarchical stage.
const string firstHierCall
= (v3Global.opt.hierBlocks().empty() || v3Global.opt.hierChild()) ? "true" : "false";
puts("_vm_pgoProfiler.write(\"" + EmitCUtil::topClassName()
+ "\", _vm_contextp__->profVltFilename(), " + firstHierCall + ");\n");
+ "\", _vm_contextp__->profVltFilename());\n");
}
puts("}\n");
@ -802,6 +799,9 @@ void EmitCSyms::emitSymImp() {
"__Vm_executionProfilerp{static_cast<VlExecutionProfiler*>(contextp->"
"enableExecutionProfiler(&VlExecutionProfiler::construct))}\n");
}
if (v3Global.opt.profPgo() && !v3Global.opt.libCreate().empty()) {
puts(" , _vm_pgoProfiler{" + std::to_string(v3Global.currentHierBlockCost()) + "}\n");
}
puts(" // Setup module instances\n");
for (const auto& i : m_scopes) {
@ -835,6 +835,9 @@ void EmitCSyms::emitSymImp() {
if (v3Global.opt.profPgo()) {
puts("// Configure profiling for PGO\n");
if (!v3Global.opt.hierChild()) {
puts("_vm_pgoProfiler.writeHeader(_vm_contextp__->profVltFilename());\n");
}
if (v3Global.opt.mtasks()) {
v3Global.rootp()->topModulep()->foreach([&](const AstExecGraph* execGraphp) {
for (const V3GraphVertex& vtx : execGraphp->depGraphp()->vertices()) {

View File

@ -127,6 +127,7 @@ class V3Global final {
bool m_hasSCTextSections = false; // Has `systemc_* sections that need to be emitted
bool m_useParallelBuild = false; // Use parallel build for model
bool m_useRandomizeMethods = false; // Need to define randomize() class methods
uint64_t m_currentHierBlockCost = 0; // Total cost of this hier block, used for scheduling
// Memory address to short string mapping (for debug)
std::unordered_map<const void*, std::string>
@ -214,6 +215,8 @@ public:
const std::string& ptrToId(const void* p);
std::thread::id mainThreadId() const { return m_mainThreadId; }
static std::vector<std::string> verilatedCppFiles();
// Getter: cost recorded for the current hier block (0 until the setter is called)
uint64_t currentHierBlockCost() const { return m_currentHierBlockCost; }
// Setter: record the cost of the current hier block
void currentHierBlockCost(uint64_t cost) { m_currentHierBlockCost = cost; }
};
extern V3Global v3Global;

View File

@ -24,8 +24,6 @@
#include "V3String.h"
#include "V3Task.h"
#include <list>
VL_DEFINE_DEBUG_FUNCTIONS;
//######################################################################
@ -104,21 +102,14 @@ class ProtectVisitor final : public VNVisitor {
txtp->addText(fl, "\n`ifdef VERILATOR\n");
txtp->addText(fl, "`verilator_config\n");
// The `eval` function is called inside both update functions. As those functions
// are created by text bashing, we need to find cost of `_eval` which is the first function
// with a real cost in AST.
uint32_t cost = 0;
modp->foreach([&cost](AstCFunc* cfuncp) {
if (cfuncp->name() == "_eval") cost = V3InstrCount::count(cfuncp, false);
});
txtp->addText(fl, "profile_data -hier-dpi \"" + m_libName
+ "_protectlib_combo_update\" -cost 64'd" + std::to_string(cost)
+ "\n");
+ "_protectlib_combo_update\" -cost 64'd"
+ std::to_string(v3Global.currentHierBlockCost()) + "\n");
txtp->addText(fl, "profile_data -hier-dpi \"" + m_libName
+ "_protectlib_seq_update\" -cost 64'd" + std::to_string(cost)
+ "\n");
+ "_protectlib_seq_update\" -cost 64'd"
+ std::to_string(v3Global.currentHierBlockCost()) + "\n");
// Mark remaining NDA protectlib wrapper DPIs as non-hazardous by deliberately forwarding
// Mark remaining NBA protectlib wrapper DPIs as non-hazardous by deliberately forwarding
// them with non-zero cost.
// Also, specify hierarchical workers for those tasks for scheduling.
txtp->addText(fl, "profile_data -hier-dpi \"" + m_libName

View File

@ -34,6 +34,7 @@
#include "V3Combine.h"
#include "V3Common.h"
#include "V3Const.h"
#include "V3Control.h"
#include "V3Coverage.h"
#include "V3CoverageJoin.h"
#include "V3Dead.h"
@ -590,6 +591,11 @@ static void process() {
// Create AstCUse to determine what class forward declarations/#includes needed in C
V3CUse::cUseAll();
// Evaluate cost of a current hierarchical block
if (!v3Global.opt.libCreate().empty()) {
v3Global.currentHierBlockCost(V3Control::getCurrentHierBlockCost());
}
}
// Output the text

View File

@ -18,6 +18,7 @@ module t (/*AUTOARG*/
generate
for (genvar i = 0; i < `CORES; ++i) Core core(clk);
for (genvar i = 0; i < `CORES; ++i) CoreHier hierCore(clk);
endgenerate
endmodule
@ -43,6 +44,15 @@ module Core(input clk);
Check check(.clk(clk), .crc(crc), .result(result), .rdata(rdata), .rdata2(rdata2));
endmodule
module CoreHier(input clk);
   // Placeholder logic whose only purpose is to provide a second, different
   // hier block at the same level.
   integer cyc = 0;

   always @(posedge clk) begin
      cyc = cyc + 1;
      // Print once, on the first clock edge
      if (cyc == 1) begin
         $display("%d", clk);
      end
   end
endmodule
module Check(
input clk,
output reg [63:0] crc,

View File

@ -11,11 +11,12 @@ import vltest_bootstrap
test.scenarios('vltmt')
test.top_filename = "t/t_hier_block_perf.v"
cycles = 100000
cycles = 100
test.sim_time = cycles * 10 + 1000
threads = 2
flags = ["--hierarchical", "-Wno-UNOPTFLAT", "-DSIM_CYCLES=" + str(cycles)]
config_file = test.t_dir + "/" + test.name + ".vlt"
flags = [config_file, "--hierarchical", "-Wno-UNOPTFLAT", "-DSIM_CYCLES=" + str(cycles)]
test.compile(benchmarksim=1, v_flags2=["--prof-pgo"] + flags, threads=threads)
@ -24,15 +25,21 @@ test.execute(all_run_flags=[
" +verilator+prof+exec+file+/dev/null",
" +verilator+prof+vlt+file+" + test.obj_dir + "/profile.vlt"]) # yapf:disable
test.file_grep(test.obj_dir + "/profile.vlt", r'profile_data -model "VTest"')
test.file_grep(test.obj_dir + "/profile.vlt", r'profile_data -model "VCheck"')
test.file_grep(test.obj_dir + "/profile.vlt", r'profile_data -model "VCoreHier"')
test.file_grep(test.obj_dir + "/profile.vlt", r'profile_data -model "V' + test.name + '"')
# Check for cost rollovers
test.file_grep_not(test.obj_dir + "/profile.vlt", r'.*cost 64\'d\d{18}.*')
# Differentiate benchmarksim results
test.name = test.name + "_optimized"
test.compile(
benchmarksim=1,
# Intentionally no --prof-pgo here to make sure profile data can be read in
# without it (that is: --prof-pgo has no effect on profile_data hash names)
v_flags2=flags,
v_flags2=[test.obj_dir + "/profile.vlt"] + flags,
threads=threads)
test.execute()

View File

@ -8,3 +8,5 @@
hier_workers -module "Test" -workers 2
hier_block -module "Check"
hier_workers -module "Check" -workers 2
hier_block -module "CoreHier"
hier_workers -module "CoreHier" -workers 2

View File

@ -1,45 +0,0 @@
#!/usr/bin/env python3
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2025 by Wilson Snyder. This program is free software; you
# can redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
import vltest_bootstrap
# Two-pass PGO flow on the hierarchical design:
#   1. build with --prof-pgo, run, and collect profile.vlt
#   2. rebuild with that profile.vlt as input and run again
test.scenarios('vltmt')
test.top_filename = "t/t_hier_block_perf.v"

cycles = 100000
test.sim_time = cycles * 10 + 1000

threads = 2
config_file = test.t_dir + "/" + test.name + ".vlt"
flags = [config_file, "--hierarchical", "-Wno-UNOPTFLAT", "-DSIM_CYCLES=" + str(cycles)]

# Pass 1: instrumented build
test.compile(benchmarksim=1, v_flags2=["--prof-pgo"] + flags, threads=threads)

# Run the instrumented model; the PGO data goes to obj_dir/profile.vlt
test.execute(all_run_flags=[
    "+verilator+prof+exec+start+0",
    " +verilator+prof+exec+file+/dev/null",
    " +verilator+prof+vlt+file+" + test.obj_dir + "/profile.vlt"])  # yapf:disable

# The profile must contain records for both the hier block and the top model
test.file_grep(test.obj_dir + "/profile.vlt", r'profile_data -model "VTest"')
test.file_grep(test.obj_dir + "/profile.vlt", r'profile_data -model "V' + test.name + '"')

# Check for cost rollovers
test.file_grep_not(test.obj_dir + "/profile.vlt", r'.*cost 64\'d\d{18}.*')

# Differentiate benchmarksim results
test.name = test.name + "_optimized"

# Pass 2: rebuild with the collected profile. profile.vlt must be passed
# explicitly, otherwise this compile never reads the data gathered above.
test.compile(
    benchmarksim=1,
    # Intentionally no --prof-pgo here to make sure profile data can be read in
    # without it (that is: --prof-pgo has no effect on profile_data hash names)
    v_flags2=[test.obj_dir + "/profile.vlt"] + flags,
    threads=threads)

test.execute()

test.passes()