Add verilator_gantt profiling of DPI imports (#3084).

This commit is contained in:
Wilson Snyder 2025-09-21 11:37:44 -04:00
parent 53b8a5b027
commit 9697a5ce6d
10 changed files with 130 additions and 25 deletions

View File

@ -14,6 +14,7 @@ Verilator 5.041 devel
**Other:**
* Add error on zero/negative unpacked dimensions (#1642). [Stefan Wallentowitz]
* Add verilator_gantt profiling of DPI imports (#3084). [Geza Lore]
* Add error on non-packed struct randc (#5999). [Seth Pellegrino]
* Add configure `--enable-asan` to compile verilator_bin with the address sanitizer (#6404). [Geza Lore]
* Add $(LDFLAGS) and $(LIBS) when building shared libraries (#6425) (#6426). [Ahmed El-Mahmoudy]

View File

@ -51,6 +51,7 @@ def read_data(filename):
re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$')
cpu = None
thread = 0
thread_last_ecpu = None
global LongestVcdStrValueLength
global ExecGraphTime
@ -109,9 +110,13 @@ def read_data(filename):
tick)
elif kind == "THREAD_SCHEDULE_WAIT_BEGIN":
ecpu = int(re_payload_wait.match(payload).groups()[0])
thread_last_ecpu = ecpu
ThreadScheduleWait[ecpu].append(tick)
elif kind == "THREAD_SCHEDULE_WAIT_END":
ecpu = int(re_payload_wait.match(payload).groups()[0])
# Might have ended on a different CPU than the one where we got THREAD_SCHEDULE_WAIT_BEGIN
assert thread_last_ecpu is not None, "THREAD_SCHEDULE_WAIT_END without BEGIN"
ecpu = thread_last_ecpu
thread_last_ecpu = None
start = ThreadScheduleWait[ecpu].pop()
WaitingTime += tick - start
ThreadScheduleWaitIntervals.append((start, tick, ecpu))

View File

@ -446,7 +446,7 @@ void EmitCSyms::emitSymHdr() {
}
puts("\n// SYMS CLASS (contains all model state)\n");
puts("class alignas(VL_CACHE_LINE_BYTES)" + EmitCUtil::symClassName()
puts("class alignas(VL_CACHE_LINE_BYTES) " + EmitCUtil::symClassName()
+ " final : public VerilatedSyms {\n");
ofp()->putsPrivate(false); // public:

View File

@ -681,7 +681,7 @@ class TaskVisitor final : public VNVisitor {
}
}
// First argument is symbol table, then output if a function
const bool needSyms = !refp->taskp()->dpiImport();
const bool needSyms = !refp->taskp()->dpiImport() || v3Global.opt.profExec();
if (needSyms) ccallp->argTypes("vlSymsp");
if (refp->taskp()->dpiContext()) {
@ -972,7 +972,7 @@ class TaskVisitor final : public VNVisitor {
if (rtnvarp) {
funcp->addStmtsp(createDpiTemp(rtnvarp, ""));
funcp->addStmtsp(createAssignInternalToDpi(rtnvarp, false, tmpSuffixp, ""));
string stmt = "return " + rtnvarp->name();
string stmt = "return " + rtnvarp->name(); // TODO use AstCReturn?
stmt += rtnvarp->basicp()->isDpiPrimitive() ? ";\n" : "[0];\n";
funcp->addStmtsp(new AstCStmt{nodep->fileline(), stmt});
}
@ -1077,6 +1077,12 @@ class TaskVisitor final : public VNVisitor {
void bodyDpiImportFunc(AstNodeFTask* nodep, AstVarScope* rtnvscp, AstCFunc* cfuncp,
AstCFunc* dpiFuncp) {
const char* const tmpSuffixp = V3Task::dpiTemporaryVarSuffix();
if (v3Global.opt.profExec())
cfuncp->addStmtsp(
new AstCStmt{nodep->fileline(),
"VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush(\"dpiimports\");\n"});
// Convert input/inout arguments to DPI types
string args;
for (AstNode* stmtp = cfuncp->argsp(); stmtp; stmtp = stmtp->nextp()) {
@ -1162,6 +1168,10 @@ class TaskVisitor final : public VNVisitor {
}
}
}
if (v3Global.opt.profExec())
cfuncp->addStmtsp(new AstCStmt{nodep->fileline(),
"VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();\n"});
}
AstVarScope* getDpiExporTrigger() {
@ -1285,9 +1295,12 @@ class TaskVisitor final : public VNVisitor {
if (cfuncp->dpiImportWrapper()) cfuncp->cname(nodep->cname());
const bool needSyms
= (!nodep->dpiImport() && !nodep->taskPublic()) || v3Global.opt.profExec();
if (needSyms) cfuncp->argTypes(EmitCUtil::symClassVar());
if (!nodep->dpiImport() && !nodep->taskPublic()) {
// Need symbol table
cfuncp->argTypes(EmitCUtil::symClassVar());
if (cfuncp->name() == "new") {
const string stmt = VIdProtect::protect("_ctor_var_reset") + "(vlSymsp);\n";
cfuncp->addInitsp(new AstCStmt{nodep->fileline(), stmt});

View File

@ -12,10 +12,11 @@
import vltest_bootstrap
test.scenarios('vlt_all')
test.top_filename = "t/t_gen_alw.v" # Any, as long as runs a few cycles
test.top_filename = "t/t_gantt.v"
test.pli_filename = "t/t_gantt_c.cpp"
test.compile(
v_flags2=["--prof-exec"],
verilator_flags2=["--prof-exec", test.pli_filename],
# Checks below care about thread count, so use 2 (minimum reasonable)
threads=(2 if test.vltmt else 1))
@ -35,13 +36,13 @@ test.run(cmd=[
])
if test.vltmt:
test.file_grep(gantt_log, r'Total threads += 2')
test.file_grep(gantt_log, r'Total mtasks += 7')
test.file_grep(gantt_log, r'Total threads += +(\d+)', 2)
test.file_grep(gantt_log, r'Total mtasks += +(\d+)', 6)
# Predicted thread utilization should be less than 100%
test.file_grep_not(gantt_log, r'Thread utilization =\s*\d\d\d+\.\d+%')
else:
test.file_grep(gantt_log, r'Total threads += 1')
test.file_grep(gantt_log, r'Total mtasks += 0')
test.file_grep(gantt_log, r'Total threads += +(\d+)', 1)
test.file_grep(gantt_log, r'Total mtasks += +(\d+)', 0)
test.file_grep(gantt_log, r'\|\s+2\s+\|\s+2\.0+\s+\|\s+eval')

70
test_regress/t/t_gantt.v Normal file
View File

@ -0,0 +1,70 @@
// DESCRIPTION: Verilator: Verilog Test module
//
// This file ONLY is placed under the Creative Commons Public Domain, for
// any use, without warranty, 2021 by Wilson Snyder.
// SPDX-License-Identifier: CC0-1.0
// Top-level test module: counts clock cycles, folds the 'result' output of the
// Test instance into a running 64-bit checksum, and compares that checksum
// against a fixed expected value at cycle 99.
module t(
input clk
);
// Cycle counter; incremented on every posedge of clk
integer cyc = 0;
// Value produced by the Test instance each cycle
wire [63:0] result;
Test test(/*AUTOINST*/
// Outputs
.result (result[63:0]),
// Inputs
.clk (clk),
.cyc (cyc));
// Running checksum accumulated from 'result'
reg [63:0] sum;
always @ (posedge clk) begin
`ifdef TEST_VERBOSE
$write("[%0t] cyc==%0d result=%x\n", $time, cyc, result);
`endif
cyc <= cyc + 1;
// Default update: XOR 'result' into 'sum' shifted left by one, with
// sum[63]^sum[2]^sum[0] fed back into bit 0. The nonblocking assignments
// in the branches below override this default when they execute.
sum <= result ^ {sum[62:0], sum[63] ^ sum[2] ^ sum[0]};
if (cyc == 0) begin
// Setup
sum <= '0;
end
else if (cyc < 10) begin
// Hold the checksum at zero for cycles 1 through 9
sum <= '0;
end
else if (cyc == 99) begin
$write("[%0t] cyc==%0d sum=%x\n", $time, cyc, sum);
// What checksum will we end up with (above print should match)
`define EXPECTED_SUM 64'haf665a181ead5e12
// Stop the simulation on a checksum mismatch; otherwise report success and finish
if (sum !== `EXPECTED_SUM) $stop;
$write("*-* All Finished *-*\n");
$finish;
end
end
endmodule
// Computes 'result' on every posedge of clk by running a 100000-iteration loop
// that calls the DPI-C import dpii_return once per iteration.
module Test(/*AUTOARG*/
// Outputs
result,
// Inputs
clk, cyc
);
input clk;
// Current cycle number, supplied by the instantiating module
input int cyc;
// Registered copy of the accumulated value; updated once per clock
output reg [63:0] result;
// Loop accumulator; cleared at the start of every clocked evaluation
logic [63:0] adder;
// C function imported through DPI; declared pure
import "DPI-C" pure function int dpii_return(input int i);
always @(posedge clk) begin
adder = 0;
// Each iteration adds the cube of (cyc+i) and the return value of
// dpii_return(1), both zero-extended to 64 bits by the concatenations
for (int i = 0; i < 100000; ++i)
adder += {32'h0, (cyc+i)} ** 3 + {32'h0, dpii_return(1)};
result <= adder;
end
endmodule

View File

@ -0,0 +1,12 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
//
// This file ONLY is placed under the Creative Commons Public Domain, for
// any use, without warranty, 2025 by Wilson Snyder.
// SPDX-License-Identifier: CC0-1.0
//
//*************************************************************************
// Returns its argument unchanged. Declared with C linkage so the symbol name
// is not C++-mangled.
extern "C" int dpii_return(int i) {
    const int echoed = i;
    return echoed;
}

View File

@ -12,10 +12,11 @@
import vltest_bootstrap
test.scenarios('vlt_all')
test.top_filename = "t/t_gen_alw.v" # Any, as long as runs a few cycles
test.top_filename = "t/t_gantt.v"
test.pli_filename = "t/t_gantt_c.cpp"
test.compile(
v_flags2=["--prof-exec", "--hierarchical"],
verilator_flags2=["--prof-exec", "--hierarchical", test.pli_filename],
# Checks below care about thread count, so use 2 (minimum reasonable)
threads=(2 if test.vltmt else 1))
@ -35,13 +36,13 @@ test.run(cmd=[
])
if test.vltmt:
test.file_grep(gantt_log, r'Total threads += 2')
test.file_grep(gantt_log, r'Total mtasks += 11')
test.file_grep(gantt_log, r'Total threads += +(\d+)', 2)
test.file_grep(gantt_log, r'Total mtasks += +(\d+)', 6)
# Predicted thread utilization should be less than 100%
test.file_grep_not(gantt_log, r'Thread utilization =\s*\d\d\d+\.\d+%')
else:
test.file_grep(gantt_log, r'Total threads += 1')
test.file_grep(gantt_log, r'Total mtasks += 0')
test.file_grep(gantt_log, r'Total threads += +(\d+)', 1)
test.file_grep(gantt_log, r'Total mtasks += +(\d+)', 0)
test.file_grep(gantt_log, r'\|\s+2\s+\|\s+2\.0+\s+\|\s+eval')

View File

@ -12,10 +12,11 @@
import vltest_bootstrap
test.scenarios('vltmt')
test.top_filename = "t/t_gen_alw.v" # Any, as long as runs a few cycles
test.top_filename = "t/t_gantt.v"
test.pli_filename = "t/t_gantt_c.cpp"
test.compile(
v_flags2=["--prof-exec"],
verilator_flags2=["--prof-exec", test.pli_filename],
# Checks below care about thread count
threads=4)

View File

@ -12,14 +12,15 @@
import vltest_bootstrap
test.scenarios('vlt_all')
test.top_filename = "t/t_gen_alw.v" # Any, as long as runs a few cycles
test.top_filename = "t/t_gantt.v"
test.pli_filename = "t/t_gantt_c.cpp"
threads_num = (2 if test.vltmt else 1)
test.compile(
make_top_shell=False,
make_main=False,
v_flags2=["--prof-exec --exe", test.pli_filename],
verilator_flags2=["--prof-exec --exe", test.pli_filename, "t/t_gantt_two.cpp"],
# Checks below care about thread count, so use 2 (minimum reasonable)
threads=threads_num,
make_flags=["CPPFLAGS_ADD=\"-DVL_NO_LEGACY -DTEST_USE_THREADS=" + str(threads_num) + "\""])
@ -41,11 +42,11 @@ test.run(cmd=[
"| tee " + gantt_log]) # yapf:disable
if test.vltmt:
test.file_grep(gantt_log, r'Total threads += 2')
test.file_grep(gantt_log, r'Total mtasks += 7')
test.file_grep(gantt_log, r'Total threads += +(\d+)', 2)
test.file_grep(gantt_log, r'Total mtasks += +(\d+)', 6)
else:
test.file_grep(gantt_log, r'Total threads += 1')
test.file_grep(gantt_log, r'Total mtasks += 0')
test.file_grep(gantt_log, r'Total threads += +(\d+)', 1)
test.file_grep(gantt_log, r'Total mtasks += +(\d+)', 0)
test.file_grep(gantt_log, r'\|\s+4\s+\|\s+4\.0+\s+\|\s+eval')