Optimize inlining small C functions and add `-inline-cfuncs` (#6815)

This commit is contained in:
Jose Drowne 2025-12-21 13:14:50 -05:00 committed by GitHub
parent e6877e83fd
commit c0a0f0dab9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
22 changed files with 509 additions and 7 deletions

View File

@ -408,6 +408,8 @@ detailed descriptions of these arguments.
-I<dir> Directory to search for includes
--if-depth <value> Tune IFDEPTH warning
+incdir+<dir> Directory to search for includes
--inline-cfuncs <value> Inline CFuncs with <=value nodes (0=off)
--inline-cfuncs-product <value> Inline CFuncs if size*calls <= value
--inline-mult <value> Tune module inlining
--instr-count-dpi <value> Assumed dynamic instruction count of DPI imports
-j <jobs> Parallelism for --build-jobs/--verilate-jobs

View File

@ -125,6 +125,7 @@ John Wehle
Jonathan Drolet
Jonathan Schröter
Jordan McConnon
Jose Drowne
Jose Loyola
Josep Sans
Joseph Nwabueze

View File

@ -867,6 +867,29 @@ Summary:
compatibility and is not recommended usage as this is not supported by
some third-party tools.
.. option:: --inline-cfuncs <value>
Inline small CFunc calls directly into their callers when the function
has at most <value> nodes. This reduces function call overhead when
:vlopt:`--output-split-cfuncs` places functions in separate compilation
units that the C++ compiler cannot inline.
Set to 0 to disable this optimization. The default is 20.
This optimization is automatically disabled when :vlopt:`--prof-cfuncs`
or :vlopt:`--trace` is used.
.. option:: --inline-cfuncs-product <value>
Tune the inlining of CFunc calls for larger functions. When a function
is too large to always inline (exceeds :vlopt:`--inline-cfuncs` threshold),
it may still be inlined if the function size multiplied by the number of
call sites is at most <value>.
This allows functions that are called only once or twice to be inlined
even if they exceed the small function threshold. Set to 0 to only inline
functions below the :vlopt:`--inline-cfuncs` threshold. The default is 200.
.. option:: --inline-mult <value>
Tune the inlining of modules. The default value of 2000 specifies that

View File

@ -114,6 +114,7 @@ set(HEADERS
V3Hasher.h
V3HierBlock.h
V3Inline.h
V3InlineCFuncs.h
V3Inst.h
V3InstrCount.h
V3Interface.h
@ -287,6 +288,7 @@ set(COMMON_SOURCES
V3Hasher.cpp
V3HierBlock.cpp
V3Inline.cpp
V3InlineCFuncs.cpp
V3Inst.cpp
V3InstrCount.cpp
V3Interface.cpp

View File

@ -284,6 +284,7 @@ RAW_OBJS_PCH_ASTNOMT = \
V3Gate.o \
V3HierBlock.o \
V3Inline.o \
V3InlineCFuncs.o \
V3Inst.o \
V3InstrCount.o \
V3Interface.o \

269
src/V3InlineCFuncs.cpp Normal file
View File

@ -0,0 +1,269 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
// DESCRIPTION: Verilator: Inline small CFuncs into their callers
//
// Code available from: https://verilator.org
//
//*************************************************************************
//
// Copyright 2003-2025 by Wilson Snyder. This program is free software; you
// can redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//*************************************************************************
// V3InlineCFuncs's Transformations:
//
// For each CCall to a small CFunc:
// - Check if function is eligible for inlining (small enough, same scope)
// - Clone local variables with unique names to avoid collisions
// - Replace CCall with cloned function body statements
//
// Two tunables control inlining:
// --inline-cfuncs <n> : Always inline if size <= n (default 20)
// --inline-cfuncs-product <n> : Also inline if size * call_count <= n (default 200)
//
//*************************************************************************
#include "V3PchAstNoMT.h" // VL_MT_DISABLED_CODE_UNIT
#include "V3InlineCFuncs.h"
#include "V3AstUserAllocator.h"
#include "V3Stats.h"
#include <map>
#include <string>
#include <tuple>
#include <vector>
VL_DEFINE_DEBUG_FUNCTIONS;
//######################################################################
// Helper visitor deciding whether a CFunc body is free of embedded C code
// (AstCStmt, AstCExpr, AstCStmtUser, AstCExprUser); any of these blocks inlining.
// Only the first offending node and its reason are recorded, for debugging.
class CFuncInlineCheckVisitor final : public VNVisitorConst {
    // STATE
    bool m_optimizable = true;  // True if function can be inlined
    string m_whyNot;  // Reason why not optimizable
    AstNode* m_whyNotNodep = nullptr;  // Node that caused non-optimizable

    // METHODS
    // Record the first reason the function cannot be inlined; later calls are ignored
    void clearOptimizable(AstNode* nodep, const string& why) {
        if (!m_optimizable) return;
        m_optimizable = false;
        m_whyNot = why;
        m_whyNotNodep = nodep;
        // Emit one complete debug message instead of several partial ones
        if (nodep) {
            UINFO(9, "CFunc not inlineable: " << why << ": " << nodep);
        } else {
            UINFO(9, "CFunc not inlineable: " << why);
        }
    }

    // VISITORS
    void visit(AstCStmt* nodep) override { clearOptimizable(nodep, "contains AstCStmt"); }
    void visit(AstCExpr* nodep) override { clearOptimizable(nodep, "contains AstCExpr"); }
    void visit(AstCStmtUser* nodep) override { clearOptimizable(nodep, "contains AstCStmtUser"); }
    void visit(AstCExprUser* nodep) override { clearOptimizable(nodep, "contains AstCExprUser"); }
    void visit(AstNode* nodep) override {
        // The verdict cannot change once a blocker was found, so stop descending
        if (!m_optimizable) return;
        iterateChildrenConst(nodep);
    }

public:
    // CONSTRUCTORS
    // Runs the check over the whole function immediately
    explicit CFuncInlineCheckVisitor(AstCFunc* cfuncp) { iterateConst(cfuncp); }

    // ACCESSORS
    bool optimizable() const { return m_optimizable; }  // True if no blocker was found
    string whyNot() const { return m_whyNot; }  // Empty when optimizable
    AstNode* whyNotNodep() const { return m_whyNotNodep; }  // nullptr when optimizable
};
//######################################################################
class InlineCFuncsVisitor final : public VNVisitor {
// NODE STATE
// AstCFunc::user1() -> vector of AstCCall* pointing to this function
// AstCFunc::user2() -> bool: true if checked for C statements
// AstCFunc::user3() -> bool: true if contains C statements (not inlineable)
const VNUser1InUse m_user1InUse;
const VNUser2InUse m_user2InUse;
const VNUser3InUse m_user3InUse;
AstUser1Allocator<AstCFunc, std::vector<AstCCall*>> m_callSites;
// STATE
VDouble0 m_statInlined; // Statistic tracking
const int m_threshold1; // Size threshold: always inline if size <= this
const int m_threshold2; // Product threshold: inline if size * calls <= this
AstCFunc* m_callerFuncp = nullptr; // Current caller function
// Tuples of (StmtExpr to replace, CFunc to inline from, caller func for vars)
std::vector<std::tuple<AstStmtExpr*, AstCFunc*, AstCFunc*>> m_toInline;
// METHODS
// Check if a function contains any $c() calls (user or internal)
// Results are cached in user2/user3 for efficiency
bool containsCStatements(AstCFunc* cfuncp) {
if (!cfuncp->user2()) {
// Not yet checked - run the check visitor
cfuncp->user2(true); // Mark as checked
const CFuncInlineCheckVisitor checker{cfuncp};
cfuncp->user3(!checker.optimizable()); // Store result (true = contains C stmts)
}
return cfuncp->user3();
}
// Check if a function is eligible for inlining into caller
bool isInlineable(AstCFunc* callerp, AstCFunc* cfuncp) {
// Must be in the same scope (same class) to access the same members
if (callerp->scopep() != cfuncp->scopep()) return false;
// Check for $c() calls that might use 'this'
if (containsCStatements(cfuncp)) return false;
// Check it's a void function (not a coroutine)
if (cfuncp->rtnTypeVoid() != "void") return false;
// Don't inline functions marked dontCombine (e.g. trace, entryPoint)
if (cfuncp->dontCombine()) return false;
// Don't inline entry point functions
if (cfuncp->entryPoint()) return false;
// Must have statements to inline
if (!cfuncp->stmtsp()) return false;
// Check size thresholds
const size_t funcSize = cfuncp->nodeCount();
// Always inline if small enough
if (funcSize <= static_cast<size_t>(m_threshold1)) return true;
// Also inline if size * call_count is reasonable
const size_t callCount = m_callSites(cfuncp).size();
if (callCount > 0 && funcSize * callCount <= static_cast<size_t>(m_threshold2)) {
return true;
}
return false;
}
// VISITORS
void visit(AstCCall* nodep) override {
iterateChildren(nodep);
AstCFunc* const cfuncp = nodep->funcp();
if (!cfuncp) return;
// Track call site for call counting
m_callSites(cfuncp).emplace_back(nodep);
}
void visit(AstCFunc* nodep) override {
VL_RESTORER(m_callerFuncp);
m_callerFuncp = nodep;
iterateChildren(nodep);
}
void visit(AstNodeModule* nodep) override {
// Process per module for better cache behavior
m_toInline.clear();
// Phase 1: Collect call sites within this module
iterateChildren(nodep);
// Phase 2: Determine which calls to inline
collectInlineCandidates(nodep);
// Phase 3: Perform inlining for this module
doInlining();
}
void visit(AstNode* nodep) override { iterateChildren(nodep); }
// Collect calls that should be inlined within this module
void collectInlineCandidates(AstNodeModule* modp) {
for (AstNode* stmtp = modp->stmtsp(); stmtp; stmtp = stmtp->nextp()) {
AstCFunc* const callerp = VN_CAST(stmtp, CFunc);
if (!callerp) continue;
callerp->foreach([&](AstCCall* callp) {
AstCFunc* const cfuncp = callp->funcp();
if (!cfuncp) return;
if (!isInlineable(callerp, cfuncp)) return;
// Walk up to find the containing StmtExpr
AstNode* stmtNodep = callp;
while (stmtNodep && !VN_IS(stmtNodep, StmtExpr) && !VN_IS(stmtNodep, CFunc)) {
stmtNodep = stmtNodep->backp();
}
AstStmtExpr* const stmtExprp = VN_CAST(stmtNodep, StmtExpr);
if (!stmtExprp) return;
m_toInline.emplace_back(stmtExprp, cfuncp, callerp);
});
}
}
// Perform the actual inlining after iteration is complete
void doInlining() {
for (const auto& tuple : m_toInline) {
AstStmtExpr* const stmtExprp = std::get<0>(tuple);
AstCFunc* const cfuncp = std::get<1>(tuple);
AstCFunc* const callerp = std::get<2>(tuple);
UINFO(6, "Inlining CFunc " << cfuncp->name() << " into " << callerp->name() << endl);
++m_statInlined;
// Clone local variables with unique names to avoid collisions
std::map<AstVar*, AstVar*> varMap;
for (AstVar* varp = cfuncp->varsp(); varp; varp = VN_AS(varp->nextp(), Var)) {
const string newName = "__Vinline_" + cfuncp->name() + "_" + varp->name();
AstVar* const newVarp = varp->cloneTree(false);
newVarp->name(newName);
callerp->addVarsp(newVarp);
varMap[varp] = newVarp;
}
// Clone the function body
AstNode* const bodyp = cfuncp->stmtsp()->cloneTree(true);
// Retarget variable references to the cloned variables
// Must iterate all sibling statements, not just the first
if (!varMap.empty()) {
for (AstNode* stmtp = bodyp; stmtp; stmtp = stmtp->nextp()) {
stmtp->foreach([&](AstVarRef* refp) {
auto it = varMap.find(refp->varp());
if (it != varMap.end()) refp->varp(it->second);
});
}
}
// Replace the statement with the inlined body
stmtExprp->addNextHere(bodyp);
VL_DO_DANGLING(stmtExprp->unlinkFrBack()->deleteTree(), stmtExprp);
}
}
public:
// CONSTRUCTORS
explicit InlineCFuncsVisitor(AstNetlist* nodep)
: m_threshold1{v3Global.opt.inlineCFuncs()}
, m_threshold2{v3Global.opt.inlineCFuncsProduct()} {
// Don't inline when profiling or tracing
if (v3Global.opt.profCFuncs() || v3Global.opt.trace()) return;
// Process modules one at a time for better cache behavior
iterateAndNextNull(nodep->modulesp());
}
~InlineCFuncsVisitor() override {
V3Stats::addStat("Optimizations, Inlined CFuncs", m_statInlined);
}
};
//######################################################################
// InlineCFuncs class functions
// Pass entry point: inline eligible CFunc calls throughout the netlist, then
// dump/check the resulting tree.
void V3InlineCFuncs::inlineAll(AstNetlist* nodep) {
    UINFO(2, __FUNCTION__ << ":");
    {
        // Scoped so the visitor is destroyed (recording its statistic)
        // before the tree is checked
        const InlineCFuncsVisitor visitor{nodep};
    }
    const bool dumpTree = dumpTreeEitherLevel() >= 6;
    V3Global::dumpCheckGlobalTree("inlinecfuncs", 0, dumpTree);
}

30
src/V3InlineCFuncs.h Normal file
View File

@ -0,0 +1,30 @@
// -*- mode: C++; c-file-style: "cc-mode" -*-
//*************************************************************************
// DESCRIPTION: Verilator: Inline small CFuncs into their callers
//
// Code available from: https://verilator.org
//
//*************************************************************************
//
// Copyright 2003-2025 by Wilson Snyder. This program is free software; you
// can redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//*************************************************************************
#ifndef VERILATOR_V3INLINECFUNCS_H_
#define VERILATOR_V3INLINECFUNCS_H_
#include "config_build.h"
#include "verilatedos.h"
class AstNetlist;
// Interface of the CFunc inlining pass ("Inline small CFuncs into their callers")
class V3InlineCFuncs final {
public:
// Run the pass over the netlist rooted at nodep; defined in V3InlineCFuncs.cpp
static void inlineAll(AstNetlist* nodep) VL_MT_DISABLED;
};
#endif // Guard

View File

@ -1528,6 +1528,8 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
}).notForRerun();
DECL_OPTION("-if-depth", Set, &m_ifDepth);
DECL_OPTION("-ignc", OnOff, &m_ignc).undocumented();
DECL_OPTION("-inline-cfuncs", Set, &m_inlineCFuncs);
DECL_OPTION("-inline-cfuncs-product", Set, &m_inlineCFuncsProduct);
DECL_OPTION("-inline-mult", Set, &m_inlineMult);
DECL_OPTION("-instr-count-dpi", CbVal, [this, fl](int val) {
m_instrCountDpi = val;

View File

@ -319,6 +319,8 @@ private:
int m_hierChild = 0; // main switch: --hierarchical-child
int m_hierThreads = 0; // main switch: --hierarchical-threads
int m_ifDepth = 0; // main switch: --if-depth
int m_inlineCFuncs = 20; // main switch: --inline-cfuncs
int m_inlineCFuncsProduct = 200; // main switch: --inline-cfuncs-product
int m_inlineMult = 2000; // main switch: --inline-mult
int m_instrCountDpi = 200; // main switch: --instr-count-dpi
bool m_jsonEditNums = true; // main switch: --no-json-edit-nums
@ -595,6 +597,8 @@ public:
int expandLimit() const { return m_expandLimit; }
int gateStmts() const { return m_gateStmts; }
int ifDepth() const { return m_ifDepth; }
int inlineCFuncs() const { return m_inlineCFuncs; }
int inlineCFuncsProduct() const { return m_inlineCFuncsProduct; }
int inlineMult() const { return m_inlineMult; }
int instrCountDpi() const { return m_instrCountDpi; }
int localizeMaxSize() const { return m_localizeMaxSize; }

View File

@ -63,6 +63,7 @@
#include "V3Graph.h"
#include "V3HierBlock.h"
#include "V3Inline.h"
#include "V3InlineCFuncs.h"
#include "V3Inst.h"
#include "V3Interface.h"
#include "V3LibMap.h"
@ -565,6 +566,11 @@ static void process() {
V3Reloop::reloopAll(v3Global.rootp());
}
if (v3Global.opt.inlineCFuncs()) {
// Inline small CFuncs to reduce function call overhead
V3InlineCFuncs::inlineAll(v3Global.rootp());
}
// Fix very deep expressions
// Mark evaluation functions as member functions, if needed.
V3Depth::depthAll(v3Global.rootp());

View File

@ -11,7 +11,7 @@ import vltest_bootstrap
test.scenarios('simulator_st')
test.compile(verilator_flags2=["--stats"])
test.compile(verilator_flags2=["--stats", "--inline-cfuncs", "0"])
test.execute(expect_filename=test.golden_filename)

View File

@ -14,7 +14,8 @@ test.top_filename = "t/t_inst_tree.v"
default_vltmt_threads = test.get_default_vltmt_threads
test.compile(
verilator_flags2=['--stats', test.t_dir + "/" + test.name + ".vlt"],
# Disable --inline-cfuncs so functions exist to be combined
verilator_flags2=['--stats', '--inline-cfuncs', '0', test.t_dir + "/" + test.name + ".vlt"],
# Force 3 threads even if we have fewer cores
threads=(default_vltmt_threads if test.vltmt else 1))

View File

@ -14,7 +14,9 @@ test.top_filename = "t/t_enum_type_methods.v"
out_filename = test.obj_dir + "/V" + test.name + ".tree.json"
test.compile(verilator_flags2=['--no-std', '--debug-check', '--no-json-edit-nums', '--flatten'],
test.compile(verilator_flags2=[
'--no-std', '--debug-check', '--no-json-edit-nums', '--flatten', '--inline-cfuncs', '0'
],
verilator_make_gmake=False,
make_top_shell=False,
make_main=False)

View File

@ -0,0 +1,25 @@
#!/usr/bin/env python3
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2024 by Wilson Snyder. This program is free software; you
# can redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
# Test driver: compile with CFunc inlining at its default size threshold and
# check the number of inlined calls reported in the stats file.
import vltest_bootstrap
test.scenarios('vlt')
# Use --output-split-cfuncs to create small functions that can be inlined
# Also test --inline-cfuncs-product option
# (200 is also the option's built-in default, so this exercises option parsing
# rather than a different threshold)
test.compile(verilator_flags2=[
"--stats", "--binary", "--output-split-cfuncs", "1", "--inline-cfuncs-product", "200"
])
# Verify inlining happened with exact count
# NOTE(review): 39 is tied to the current design and pass ordering; it needs
# updating whenever either changes
test.file_grep(test.stats, r'Optimizations, Inlined CFuncs\s+(\d+)', 39)
# Run the simulation to confirm the inlined model still behaves correctly
test.execute()
test.passes()

View File

@ -0,0 +1,58 @@
// DESCRIPTION: Verilator: Verilog Test module
//
// This file ONLY is placed under the Creative Commons Public Domain, for
// any use, without warranty, 2024 by Wilson Snyder.
// SPDX-License-Identifier: CC0-1.0
// Test module designed to generate multiple small CFuncs that can be inlined
// Uses generate to create multiple sub-module instances
// Top-level test: feeds a value through a chain of CNT 'sub' instances and
// checks the value that arrives at the end of the chain.
module t (/*AUTOARG*/
// Inputs
clk
);
input clk;
// Counter incremented on every rising clock edge
integer cyc = 0;
// Number of chained 'sub' instances
parameter CNT = 8;
// w[g] is the input of instance g and w[g+1] is its output
wire [31:0] w [CNT:0];
// Register driving the head of the chain
reg [31:0] w0;
assign w[0] = w0;
// Generate multiple sub-modules - each creates CFuncs that can be inlined
generate
for (genvar g=0; g<CNT; g++) begin : gen_sub
sub sub_inst (.clk(clk), .i(w[g]), .z(w[g+1]));
end
endgenerate
// Test loop
always @ (posedge clk) begin
cyc <= cyc + 1;
if (cyc==0) begin
// Load the start value into the head of the chain
w0 <= 32'h10;
end
else if (cyc==10) begin
// Each sub adds 1, so final value is 0x10 + 8 = 0x18
if (w[CNT] !== 32'h18) begin
$write("%%Error: w[CNT]=%0x, expected 0x18\n", w[CNT]);
$stop;
end
$write("*-* All Finished *-*\n");
$finish;
end
end
endmodule
// Small sub-module that generates inlineable CFuncs
// One chain stage: on each rising clock edge, z gets i plus the registered
// constant local_b (which is loaded with 1).
module sub (input clk, input [31:0] i, output reg [31:0] z);
// Captures the low byte of i; written but never read in this module
reg [7:0] local_a;
// Registered constant 1, zero-extended and added to i
reg [7:0] local_b;
always @(posedge clk) begin
local_a <= i[7:0];
local_b <= 8'd1;
// Non-blocking: uses the value local_b held before this edge
z <= i + {24'b0, local_b};
end
endmodule

View File

@ -0,0 +1,23 @@
#!/usr/bin/env python3
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2024 by Wilson Snyder. This program is free software; you
# can redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
# Test driver: same design as t_opt_inline_cfuncs, compiled with the inlining
# pass switched off.
import vltest_bootstrap
test.scenarios('vlt')
# Reuse the Verilog source of the inlining test
test.top_filename = "t/t_opt_inline_cfuncs.v"
# Disable inlining with --inline-cfuncs 0
test.compile(verilator_flags2=["--stats", "--binary", "--inline-cfuncs", "0"])
# Verify inlining did NOT happen (stat doesn't exist when pass is skipped)
# The pattern only matches a count starting with 1-9, so a missing stat line
# and a reported count of 0 both pass
test.file_grep_not(test.stats, r'Optimizations, Inlined CFuncs\s+[1-9]')
# The model must still run correctly without inlining
test.execute()
test.passes()

View File

@ -0,0 +1,25 @@
#!/usr/bin/env python3
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# Copyright 2024 by Wilson Snyder. This program is free software; you
# can redistribute it and/or modify it under the terms of either the GNU
# Lesser General Public License Version 3 or the Perl Artistic License
# Version 2.0.
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
# Test driver: run the inlining pass with thresholds low enough that every
# candidate is rejected, and check that the reported count is zero.
import vltest_bootstrap
test.scenarios('vlt')
# Use thresholds that guarantee rejection to test the "return false" path in isInlineable()
# --inline-cfuncs 1: pass still runs (not skipped)
# --inline-cfuncs-product 0: guarantees all functions rejected (node_count * call_count > 0 always)
# No top_filename is set here - presumably the driver's default source for this
# test is used; confirm against the test framework
test.compile(verilator_flags2=[
"--stats", "--binary", "--inline-cfuncs", "1", "--inline-cfuncs-product", "0"
])
# The pass ran, so the stat line must exist and report zero inlined calls
test.file_grep(test.stats, r'Optimizations, Inlined CFuncs\s+(\d+)', 0)
test.execute()
test.passes()

View File

@ -0,0 +1,27 @@
// DESCRIPTION: Verilator: Verilog Test module
//
// This file ONLY is placed under the Creative Commons Public Domain, for
// any use, without warranty, 2024 by Wilson Snyder.
// SPDX-License-Identifier: CC0-1.0
// Test module to exercise threshold checking in CFunc inlining
// With low thresholds, these functions should NOT be inlined
// Self-checking test: a single initial block with a chain of dependent
// assignments, ending in a check of the final value.
module t;
reg [31:0] a, b, c, d, e, f, g, h;
initial begin
// Multiple operations to create larger CFuncs
a = 32'd1;
b = 32'd2;
c = a + b;  // 3
d = c * 2;  // 6
e = d - 1;  // 5
f = e + a;  // 6
g = f * b;  // 12
h = g + c + d + e + f;  // 12 + 3 + 6 + 5 + 6 = 32
// Stop with an error if the arithmetic above did not produce 32
if (h != 32'd32) $stop;
$write("*-* All Finished *-*\n");
$finish;
end
endmodule

View File

@ -12,7 +12,8 @@ import vltest_bootstrap
test.scenarios('vlt_all')
test.top_filename = "t/t_timing_sched.v"
test.compile(verilator_flags2=["--exe --main --timing"])
test.compile(
verilator_flags2=["--binary", "--timing", "--inline-cfuncs", "0", "-CFLAGS", "-DVL_DEBUG"])
test.execute(all_run_flags=["+verilator+debug"])

View File

@ -12,7 +12,8 @@ import vltest_bootstrap
test.scenarios('vlt_all')
test.top_filename = "t/t_timing_class.v"
test.compile(verilator_flags2=["--exe --main --timing"])
# Disable --inline-cfuncs so debug traces show all function entries
test.compile(verilator_flags2=["--exe --main --timing --inline-cfuncs 0"])
test.execute(all_run_flags=["+verilator+debug"])

View File

@ -37,7 +37,6 @@ internalsDump:
-V{t#,#}+ Vt_verilated_debug___024root___eval_phase__nba
-V{t#,#}+ Vt_verilated_debug___024root___trigger_anySet__act
-V{t#,#}+ Vt_verilated_debug___024root___eval_nba
-V{t#,#}+ Vt_verilated_debug___024root___nba_sequent__TOP__0
*-* All Finished *-*
-V{t#,#}+ Vt_verilated_debug___024root___trigger_clear__act
-V{t#,#}+ Vt_verilated_debug___024root___eval_phase__act

View File

@ -14,7 +14,7 @@ test.top_filename = "t/t_enum_type_methods.v"
out_filename = test.obj_dir + "/V" + test.name + ".xml"
test.compile(verilator_flags2=['--no-std', '--debug-check', '--flatten'],
test.compile(verilator_flags2=['--no-std', '--debug-check', '--flatten', '--inline-cfuncs', '0'],
verilator_make_gmake=False,
make_top_shell=False,
make_main=False)