diff --git a/Changes b/Changes index 48a4c4e25..8ae8a6269 100644 --- a/Changes +++ b/Changes @@ -14,6 +14,7 @@ Verilator 5.041 devel **Other:** * Add error on zero/negative unpacked dimensions (#1642). [Stefan Wallentowitz] +* Add verilator_gantt profiling of DPI imports (#3084). [Geza Lore] * Add error on non-packed struct randc (#5999). [Seth Pellegrino] * Add configure `--enable-asan` to compile verilator_bin with the address sanitizer (#6404). [Geza Lore] * Add $(LDFLAGS) and $(LIBS) to when building shared libraries (#6425) (#6426). [Ahmed El-Mahmoudy] diff --git a/bin/verilator_gantt b/bin/verilator_gantt index c84533660..29d335065 100755 --- a/bin/verilator_gantt +++ b/bin/verilator_gantt @@ -51,6 +51,7 @@ def read_data(filename): re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$') cpu = None thread = 0 + thread_last_ecpu = None global LongestVcdStrValueLength global ExecGraphTime @@ -109,9 +110,13 @@ def read_data(filename): tick) elif kind == "THREAD_SCHEDULE_WAIT_BEGIN": ecpu = int(re_payload_wait.match(payload).groups()[0]) + thread_last_ecpu = ecpu ThreadScheduleWait[ecpu].append(tick) elif kind == "THREAD_SCHEDULE_WAIT_END": - ecpu = int(re_payload_wait.match(payload).groups()[0]) + # Might have ended on a different CPU than the one where we got THREAD_SCHEDULE_WAIT_BEGIN + assert thread_last_ecpu is not None, "THREAD_SCHEDULE_WAIT_END without BEGIN" + ecpu = thread_last_ecpu + thread_last_ecpu = None start = ThreadScheduleWait[ecpu].pop() WaitingTime += tick - start ThreadScheduleWaitIntervals.append((start, tick, ecpu)) diff --git a/src/V3EmitCSyms.cpp b/src/V3EmitCSyms.cpp index c72db2635..b1cb2e428 100644 --- a/src/V3EmitCSyms.cpp +++ b/src/V3EmitCSyms.cpp @@ -446,7 +446,7 @@ void EmitCSyms::emitSymHdr() { } puts("\n// SYMS CLASS (contains all model state)\n"); - puts("class alignas(VL_CACHE_LINE_BYTES)" + EmitCUtil::symClassName() + puts("class alignas(VL_CACHE_LINE_BYTES) " + EmitCUtil::symClassName() + " final : public VerilatedSyms {\n"); 
ofp()->putsPrivate(false); // public: diff --git a/src/V3Task.cpp b/src/V3Task.cpp index 2520c3236..472004421 100644 --- a/src/V3Task.cpp +++ b/src/V3Task.cpp @@ -681,7 +681,7 @@ class TaskVisitor final : public VNVisitor { } } // First argument is symbol table, then output if a function - const bool needSyms = !refp->taskp()->dpiImport(); + const bool needSyms = !refp->taskp()->dpiImport() || v3Global.opt.profExec(); if (needSyms) ccallp->argTypes("vlSymsp"); if (refp->taskp()->dpiContext()) { @@ -972,7 +972,7 @@ class TaskVisitor final : public VNVisitor { if (rtnvarp) { funcp->addStmtsp(createDpiTemp(rtnvarp, "")); funcp->addStmtsp(createAssignInternalToDpi(rtnvarp, false, tmpSuffixp, "")); - string stmt = "return " + rtnvarp->name(); + string stmt = "return " + rtnvarp->name(); // TODO use AstCReturn? stmt += rtnvarp->basicp()->isDpiPrimitive() ? ";\n" : "[0];\n"; funcp->addStmtsp(new AstCStmt{nodep->fileline(), stmt}); } @@ -1077,6 +1077,12 @@ class TaskVisitor final : public VNVisitor { void bodyDpiImportFunc(AstNodeFTask* nodep, AstVarScope* rtnvscp, AstCFunc* cfuncp, AstCFunc* dpiFuncp) { const char* const tmpSuffixp = V3Task::dpiTemporaryVarSuffix(); + + if (v3Global.opt.profExec()) + cfuncp->addStmtsp( + new AstCStmt{nodep->fileline(), + "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPush(\"dpiimports\");\n"}); + // Convert input/inout arguments to DPI types string args; for (AstNode* stmtp = cfuncp->argsp(); stmtp; stmtp = stmtp->nextp()) { @@ -1162,6 +1168,10 @@ class TaskVisitor final : public VNVisitor { } } } + + if (v3Global.opt.profExec()) + cfuncp->addStmtsp(new AstCStmt{nodep->fileline(), + "VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();\n"}); } AstVarScope* getDpiExporTrigger() { @@ -1285,9 +1295,12 @@ class TaskVisitor final : public VNVisitor { if (cfuncp->dpiImportWrapper()) cfuncp->cname(nodep->cname()); + const bool needSyms + = (!nodep->dpiImport() && !nodep->taskPublic()) || v3Global.opt.profExec(); + if (needSyms) 
cfuncp->argTypes(EmitCUtil::symClassVar()); + if (!nodep->dpiImport() && !nodep->taskPublic()) { // Need symbol table - cfuncp->argTypes(EmitCUtil::symClassVar()); if (cfuncp->name() == "new") { const string stmt = VIdProtect::protect("_ctor_var_reset") + "(vlSymsp);\n"; cfuncp->addInitsp(new AstCStmt{nodep->fileline(), stmt}); diff --git a/test_regress/t/t_gantt.py b/test_regress/t/t_gantt.py index 84ceceb8d..2ade0911a 100755 --- a/test_regress/t/t_gantt.py +++ b/test_regress/t/t_gantt.py @@ -12,10 +12,11 @@ import vltest_bootstrap test.scenarios('vlt_all') -test.top_filename = "t/t_gen_alw.v" # Any, as long as runs a few cycles +test.top_filename = "t/t_gantt.v" +test.pli_filename = "t/t_gantt_c.cpp" test.compile( - v_flags2=["--prof-exec"], + verilator_flags2=["--prof-exec", test.pli_filename], # Checks below care about thread count, so use 2 (minimum reasonable) threads=(2 if test.vltmt else 1)) @@ -35,13 +36,13 @@ test.run(cmd=[ ]) if test.vltmt: - test.file_grep(gantt_log, r'Total threads += 2') - test.file_grep(gantt_log, r'Total mtasks += 7') + test.file_grep(gantt_log, r'Total threads += +(\d+)', 2) + test.file_grep(gantt_log, r'Total mtasks += +(\d+)', 6) # Predicted thread utilization should be less than 100% test.file_grep_not(gantt_log, r'Thread utilization =\s*\d\d\d+\.\d+%') else: - test.file_grep(gantt_log, r'Total threads += 1') - test.file_grep(gantt_log, r'Total mtasks += 0') + test.file_grep(gantt_log, r'Total threads += +(\d+)', 1) + test.file_grep(gantt_log, r'Total mtasks += +(\d+)', 0) test.file_grep(gantt_log, r'\|\s+2\s+\|\s+2\.0+\s+\|\s+eval') diff --git a/test_regress/t/t_gantt.v b/test_regress/t/t_gantt.v new file mode 100644 index 000000000..678f4b28e --- /dev/null +++ b/test_regress/t/t_gantt.v @@ -0,0 +1,70 @@ +// DESCRIPTION: Verilator: Verilog Test module +// +// This file ONLY is placed under the Creative Commons Public Domain, for +// any use, without warranty, 2021 by Wilson Snyder. 
+// SPDX-License-Identifier: CC0-1.0 + +module t( + input clk +); + + integer cyc = 0; + wire [63:0] result; + + Test test(/*AUTOINST*/ + // Outputs + .result (result[63:0]), + // Inputs + .clk (clk), + .cyc (cyc)); + + reg [63:0] sum; + + always @ (posedge clk) begin +`ifdef TEST_VERBOSE + $write("[%0t] cyc==%0d result=%x\n", $time, cyc, result); +`endif + cyc <= cyc + 1; + sum <= result ^ {sum[62:0], sum[63] ^ sum[2] ^ sum[0]}; + if (cyc == 0) begin + // Setup + sum <= '0; + end + else if (cyc < 10) begin + sum <= '0; + end + else if (cyc == 99) begin + $write("[%0t] cyc==%0d sum=%x\n", $time, cyc, sum); + // What checksum will we end up with (above print should match) +`define EXPECTED_SUM 64'haf665a181ead5e12 + if (sum !== `EXPECTED_SUM) $stop; + $write("*-* All Finished *-*\n"); + $finish; + end + end + +endmodule + +module Test(/*AUTOARG*/ + // Outputs + result, + // Inputs + clk, cyc + ); + + input clk; + input int cyc; + output reg [63:0] result; + + logic [63:0] adder; + + import "DPI-C" pure function int dpii_return(input int i); + + always @(posedge clk) begin + adder = 0; + for (int i = 0; i < 100000; ++i) + adder += {32'h0, (cyc+i)} ** 3 + {32'h0, dpii_return(1)}; + + result <= adder; + end +endmodule diff --git a/test_regress/t/t_gantt_c.cpp b/test_regress/t/t_gantt_c.cpp new file mode 100644 index 000000000..670b78ebc --- /dev/null +++ b/test_regress/t/t_gantt_c.cpp @@ -0,0 +1,12 @@ +// -*- mode: C++; c-file-style: "cc-mode" -*- +//************************************************************************* +// +// This file ONLY is placed under the Creative Commons Public Domain, for +// any use, without warranty, 2025 by Wilson Snyder. 
+// SPDX-License-Identifier: CC0-1.0 +// +//************************************************************************* + +extern "C" { +int dpii_return(int i) { return i; } +} diff --git a/test_regress/t/t_gantt_hier.py b/test_regress/t/t_gantt_hier.py index fb6c52c6e..ea5237f6b 100755 --- a/test_regress/t/t_gantt_hier.py +++ b/test_regress/t/t_gantt_hier.py @@ -12,10 +12,11 @@ import vltest_bootstrap test.scenarios('vlt_all') -test.top_filename = "t/t_gen_alw.v" # Any, as long as runs a few cycles +test.top_filename = "t/t_gantt.v" +test.pli_filename = "t/t_gantt_c.cpp" test.compile( - v_flags2=["--prof-exec", "--hierarchical"], + verilator_flags2=["--prof-exec", "--hierarchical", test.pli_filename], # Checks below care about thread count, so use 2 (minimum reasonable) threads=(2 if test.vltmt else 1)) @@ -35,13 +36,13 @@ test.run(cmd=[ ]) if test.vltmt: - test.file_grep(gantt_log, r'Total threads += 2') - test.file_grep(gantt_log, r'Total mtasks += 11') + test.file_grep(gantt_log, r'Total threads += +(\d+)', 2) + test.file_grep(gantt_log, r'Total mtasks += +(\d+)', 6) # Predicted thread utilization should be less than 100% test.file_grep_not(gantt_log, r'Thread utilization =\s*\d\d\d+\.\d+%') else: - test.file_grep(gantt_log, r'Total threads += 1') - test.file_grep(gantt_log, r'Total mtasks += 0') + test.file_grep(gantt_log, r'Total threads += +(\d+)', 1) + test.file_grep(gantt_log, r'Total mtasks += +(\d+)', 0) test.file_grep(gantt_log, r'\|\s+2\s+\|\s+2\.0+\s+\|\s+eval') diff --git a/test_regress/t/t_gantt_numa.py b/test_regress/t/t_gantt_numa.py index db25d0d87..7828e3e1f 100755 --- a/test_regress/t/t_gantt_numa.py +++ b/test_regress/t/t_gantt_numa.py @@ -12,10 +12,11 @@ import vltest_bootstrap test.scenarios('vltmt') -test.top_filename = "t/t_gen_alw.v" # Any, as long as runs a few cycles +test.top_filename = "t/t_gantt.v" +test.pli_filename = "t/t_gantt_c.cpp" test.compile( - v_flags2=["--prof-exec"], + verilator_flags2=["--prof-exec", test.pli_filename], # 
Checks below care about thread count threads=4) diff --git a/test_regress/t/t_gantt_two.py b/test_regress/t/t_gantt_two.py index 4ba41a8d4..3ee322182 100755 --- a/test_regress/t/t_gantt_two.py +++ b/test_regress/t/t_gantt_two.py @@ -12,14 +12,15 @@ import vltest_bootstrap test.scenarios('vlt_all') -test.top_filename = "t/t_gen_alw.v" # Any, as long as runs a few cycles +test.top_filename = "t/t_gantt.v" +test.pli_filename = "t/t_gantt_c.cpp" threads_num = (2 if test.vltmt else 1) test.compile( make_top_shell=False, make_main=False, - v_flags2=["--prof-exec --exe", test.pli_filename], + verilator_flags2=["--prof-exec --exe", test.pli_filename, "t/t_gantt_two.cpp"], # Checks below care about thread count, so use 2 (minimum reasonable) threads=threads_num, make_flags=["CPPFLAGS_ADD=\"-DVL_NO_LEGACY -DTEST_USE_THREADS=" + str(threads_num) + "\""]) @@ -41,11 +42,11 @@ test.run(cmd=[ "| tee " + gantt_log]) # yapf:disable if test.vltmt: - test.file_grep(gantt_log, r'Total threads += 2') - test.file_grep(gantt_log, r'Total mtasks += 7') + test.file_grep(gantt_log, r'Total threads += +(\d+)', 2) + test.file_grep(gantt_log, r'Total mtasks += +(\d+)', 6) else: - test.file_grep(gantt_log, r'Total threads += 1') - test.file_grep(gantt_log, r'Total mtasks += 0') + test.file_grep(gantt_log, r'Total threads += +(\d+)', 1) + test.file_grep(gantt_log, r'Total mtasks += +(\d+)', 0) test.file_grep(gantt_log, r'\|\s+4\s+\|\s+4\.0+\s+\|\s+eval')