Optimize V3Gate inlining heuristic (#7716)

V3Gate used to inline too many expensive operations. One particularly
bad example is `{<<{wide}}` (bit-reverse of a wide signal), which is a
single input node and was therefore always inlined, even though it is
quite expensive to compute.

Change the heuristic to inline single input nodes only if they are not
wide, or are a cheap wide operation. Otherwise treat them the same as
multi-input ops and inline them only if they are used no more than once.
This commit is contained in:
Geza Lore 2026-06-11 20:59:18 +01:00 committed by GitHub
parent 0ee5cbf502
commit 0ee25038ac
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 52 additions and 170 deletions

View File

@ -543,89 +543,62 @@ class GateInline final {
// Logic block with pending substitutions are stored in this map, together with their ordinal
std::unordered_map<AstNode*, size_t> m_hasPending;
size_t m_statInlined = 0; // Statistic tracking - signals inlined
size_t m_statRefs = 0; // Statistic tracking
size_t m_statExcluded = 0; // Statistic tracking
size_t m_statNotInlined = 0; // Statistic tracking - signals not inlined due to cost
size_t m_statRefs = 0; // Statistic tracking - number of input variable references replaced
// METHODS
static bool isCheapWide(const AstNodeExpr* exprp) {
static bool isCheap(const AstNodeExpr* exprp) {
// Constant is cheap
if (VN_IS(exprp, Const)) return true;
// Variable reference is cheap
if (VN_IS(exprp, NodeVarRef)) return true;
// AstSel is cheap if the fromp is cheap, and not a wide needing bit swizzling
if (const AstSel* const selp = VN_CAST(exprp, Sel)) {
if (!isCheap(selp->fromp())) return false;
if (!selp->isWide()) return true;
if (!VN_IS(selp->lsbp(), Const)) return false;
if (selp->lsbConst() % VL_EDATASIZE != 0) return false;
exprp = selp->fromp();
return true;
}
if (const AstArraySel* const aselp = VN_CAST(exprp, ArraySel)) exprp = aselp->fromp();
return VN_IS(exprp, Const) || VN_IS(exprp, NodeVarRef);
}
static bool excludedWide(GateVarVertex* const vVtxp, const AstNodeExpr* const rhsp) {
// Handle wides with logic drivers that are too wide for V3Expand.
if (!vVtxp->varScp()->isWide() //
|| vVtxp->varScp()->widthWords() <= v3Global.opt.expandLimit() //
|| vVtxp->inEmpty() //
|| isCheapWide(rhsp))
return false;
const GateLogicVertex* const lVtxp
= vVtxp->inEdges().frontp()->fromp()->as<GateLogicVertex>();
// Exclude from inlining variables READ multiple times.
// To decouple actives thus simplifying scheduling, exclude only those
// VarRefs that are referenced under the same active as they were assigned.
if (const AstActive* const primaryActivep = lVtxp->activep()) {
size_t reads = 0;
for (const V3GraphEdge& edge : vVtxp->outEdges()) {
const GateLogicVertex* const lvp = edge.top()->as<GateLogicVertex>();
if (lvp->activep() != primaryActivep) continue;
reads += edge.weight();
if (reads > 1) return true;
}
// AstArraySel is cheap if the fromp is cheap
if (const AstArraySel* const aselp = VN_CAST(exprp, ArraySel)) {
return isCheap(aselp->fromp());
}
// Otherwise it is not cheap
return false;
}
bool shouldInline(GateVarVertex* vVtxp, GateLogicVertex* lVtxp, size_t nReads,
AstNodeExpr* substp, bool allowMultiIn) {
AstVarScope* const vscp = vVtxp->varScp();
// Always inline constants
if (VN_IS(substp, Const)) return true;
// Don't inline non-constant static initializers
// Don't inline non-constant static initializers - these are scheduled differently
if (lVtxp->staticInit()) return false;
// Inline simple variable references
if (VN_IS(substp, VarRef)) return true;
// Only inline arrays if a simple variable or constant
if (VN_IS(vscp->dtypep()->skipRefp(), UnpackArrayDType)) return false;
// Inline constant array selects
if (VN_IS(substp, ArraySel) && nReads <= 1) return true;
// Don't inline expensive wide operations
if (excludedWide(vVtxp, substp)) {
++m_statExcluded;
UINFO(9, "Gate inline exclude '" << vVtxp->name() << "'");
vVtxp->clearReducible("Excluded wide"); // Check once.
return false;
}
if (nReads == 0) {
// Reads no variables, likely unfolded constant expression
return true;
} else if (nReads == 1) {
// Reads one variable
return true;
} else {
// Reads more two or more variables
if (!allowMultiIn) return false;
// Do it if not used, or used only once, ignoring slow code
int n = 0;
for (V3GraphEdge& edge : vVtxp->outEdges()) {
const GateLogicVertex* const dstVtxp = edge.top()->as<GateLogicVertex>();
// Ignore slow code, or if the destination is not used
if (dstVtxp->slow()) continue;
if (dstVtxp->outEmpty() && !dstVtxp->consumed()) continue;
n += edge.weight();
if (n > 1) return false;
if (VN_IS(vVtxp->varScp()->dtypep()->skipRefp(), UnpackArrayDType)) return false;
// Inline if reads no variables - unfolded constant expression, nullary builtin e.g.: $time
if (nReads == 0) return true;
// If it reads one variable, inline if not wide, or if cheap
if (nReads == 1 && (!substp->isWide() || isCheap(substp))) return true;
// Don't inline on first round if reads more than one variable
if (nReads > 1 && !allowMultiIn) return false;
// Reads multiple variables, or is expensive to compute.
// Inline if used only once, ignoring slow code, or dead code that can be deleted.
int n = 0;
for (V3GraphEdge& edge : vVtxp->outEdges()) {
const GateLogicVertex* const dstVtxp = edge.top()->as<GateLogicVertex>();
// Ignore slow code, or if the destination is not used
if (dstVtxp->slow()) continue;
if (dstVtxp->outEmpty() && !dstVtxp->consumed()) continue;
n += edge.weight();
if (n > 1) {
++m_statNotInlined;
return false;
}
return true;
}
return true;
}
void recordSubstitution(AstVarScope* vscp, AstNodeExpr* substp, AstNode* logicp) {
@ -724,7 +697,7 @@ class GateInline final {
if (!okVisitor.varAssigned(vVtxp->varScp())) continue;
// Expression we are considering to substitute with
AstNodeExpr* const substp = okVisitor.substitutionp();
AstNodeExpr* const substp = V3Const::constifyEdit(okVisitor.substitutionp());
// Number of variables read by the substitution
const size_t nReads = okVisitor.readVscps().size();
@ -832,9 +805,9 @@ class GateInline final {
}
~GateInline() {
V3Stats::addStat("Optimizations, Gate sigs deleted", m_statInlined);
V3Stats::addStat("Optimizations, Gate inputs replaced", m_statRefs);
V3Stats::addStat("Optimizations, Gate excluded wide expressions", m_statExcluded);
V3Stats::addStat("Optimizations, Gate signals inlined", m_statInlined);
V3Stats::addStat("Optimizations, Gate signals not inlined due to cost", m_statNotInlined);
V3Stats::addStat("Optimizations, Gate reads replaced", m_statRefs);
}
public:

View File

@ -48,6 +48,6 @@ test.compile(
test.execute()
# Must be <<9000 above to prove this worked
test.file_grep(test.stats, r'Optimizations, Gate sigs deleted\s+(\d+)', 8550)
test.file_grep(test.stats, r'Optimizations, Gate signals inlined\s+(\d+)', 8550)
test.passes()

View File

@ -13,7 +13,7 @@ test.scenarios('vlt')
test.lint(verilator_flags2=['--stats', '--expand-limit 5'])
test.file_grep(test.stats, r'Optimizations, Gate excluded wide expressions\s+(\d+)', 2)
test.file_grep(test.stats, r'Optimizations, Gate sigs deleted\s+(\d+)', 0)
test.file_grep(test.stats, r'Optimizations, Gate signals not inlined due to cost\s+(\d+)', 4)
test.file_grep(test.stats, r'Optimizations, Gate signals inlined\s+(\d+)', 0)
test.passes()

View File

@ -13,7 +13,7 @@ test.scenarios('vlt')
test.lint(verilator_flags2=['--stats', '--expand-limit 5', '-fno-dfg'])
test.file_grep(test.stats, r'Optimizations, Gate excluded wide expressions\s+(\d+)', 0)
test.file_grep(test.stats, r'Optimizations, Gate sigs deleted\s+(\d+)', 1)
test.file_grep(test.stats, r'Optimizations, Gate signals not inlined due to cost\s+(\d+)', 0)
test.file_grep(test.stats, r'Optimizations, Gate signals inlined\s+(\d+)', 1)
test.passes()

View File

@ -13,7 +13,7 @@ test.scenarios('vlt')
test.lint(verilator_flags2=['--stats', '--expand-limit 5'])
test.file_grep(test.stats, r'Optimizations, Gate excluded wide expressions\s+(\d+)', 0)
test.file_grep(test.stats, r'Optimizations, Gate sigs deleted\s+(\d+)', 2)
test.file_grep(test.stats, r'Optimizations, Gate signals not inlined due to cost\s+(\d+)', 0)
test.file_grep(test.stats, r'Optimizations, Gate signals inlined\s+(\d+)', 2)
test.passes()

View File

@ -1,18 +0,0 @@
#!/usr/bin/env python3
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of either the GNU Lesser General Public License Version 3
# or the Perl Artistic License Version 2.0.
# SPDX-FileCopyrightText: 2024 Wilson Snyder
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
import vltest_bootstrap
# Restrict this test to the 'vlt' scenario
test.scenarios('vlt')
# Run lint, passing --stats and --expand-limit 5 to Verilator
test.lint(verilator_flags2=['--stats', '--expand-limit 5'])
# Check the "Gate excluded wide expressions" count in the stats file
# NOTE(review): presumably the last argument is the expected value of the captured group - confirm against the test driver
test.file_grep(test.stats, r'Optimizations, Gate excluded wide expressions\s+(\d+)', 0)
test.passes()

View File

@ -1,34 +0,0 @@
// DESCRIPTION: Verilator: Verilog Test module
//
// This file ONLY is placed under the Creative Commons Public Domain
// SPDX-FileCopyrightText: 2024 Antmicro
// SPDX-License-Identifier: CC0-1.0
localparam N = 256; // Wider than expand limit.
// Top-level test module: derives the N-bit signal 'wide' from input 'i' and
// hands both to an instance of 'sub', which drives output 'o'.
module t (
input wire [N-1:0] i,
output wire [N-1:0] o
);
// Do not exclude from inlining wides referenced in different scope.
// 'wide' is the bitwise XNOR (~^) of the parameter N and 'i'; within this
// module it is only used as a port connection to 'sub'.
wire [N-1:0] wide = N ~^ i;
// Positional port connections, in the order 'sub' declares its ports:
// i -> sub.i, wide -> sub.wide, o -> sub.o
sub sub (
i,
wide,
o
);
endmodule
// Helper module: in an initial block, sets each bit of 'o' to the OR of the
// mirrored bits of 'i' and 'wide'.
module sub (
input wire [N-1:0] i,
input wire [N-1:0] wide,
output logic [N-1:0] o
);
initial begin
// Bit n of 'o' is taken from bit N-1-n of both inputs, i.e. 'o' is the
// bit-reversed OR of 'i' and 'wide'. 'wide' is read once per loop iteration.
for (integer n = 0; n < N; ++n) begin
o[n] = i[N-1-n] | wide[N-1-n];
end
end
endmodule

View File

@ -13,8 +13,8 @@ test.scenarios('vlt')
test.lint(verilator_flags2=['--stats', '--expand-limit 5', '-fno-var-split'])
test.file_grep(test.stats, r'Optimizations, Gate excluded wide expressions\s+(\d+)', 1)
test.file_grep(test.stats, r'Optimizations, Gate sigs deleted\s+(\d+)', 1)
test.file_grep(test.stats, r'Optimizations, Gate signals not inlined due to cost\s+(\d+)', 2)
test.file_grep(test.stats, r'Optimizations, Gate signals inlined\s+(\d+)', 1)
test.file_grep(test.stats, r'SplitVar, packed variables split automatically\s+(\d+)', 0)
test.passes()

View File

@ -1,18 +0,0 @@
#!/usr/bin/env python3
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of either the GNU Lesser General Public License Version 3
# or the Perl Artistic License Version 2.0.
# SPDX-FileCopyrightText: 2024 Wilson Snyder
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
import vltest_bootstrap
# Restrict this test to the 'vlt' scenario
test.scenarios('vlt')
# Run lint, passing --stats and --expand-limit 5 to Verilator
test.lint(verilator_flags2=['--stats', '--expand-limit 5'])
# Check the "Gate excluded wide expressions" count in the stats file
# NOTE(review): presumably the last argument is the expected value of the captured group - confirm against the test driver
test.file_grep(test.stats, r'Optimizations, Gate excluded wide expressions\s+(\d+)', 0)
test.passes()

View File

@ -1,21 +0,0 @@
// DESCRIPTION: Verilator: Verilog Test module
//
// This file ONLY is placed under the Creative Commons Public Domain
// SPDX-FileCopyrightText: 2024 Antmicro
// SPDX-License-Identifier: CC0-1.0
localparam N = 65; // Wide but narrower than expand limit
// Test module: XORs each bit of input 'i' with the matching bit of a locally
// computed wide signal, using one continuous assignment per bit.
module t (
input wire [N-1:0] i,
output wire [N-1:0] o
);
// Do not exclude from inlining wides small enough to be handled by
// V3Expand.
// 'wide_small' is declared as [65:0] (66 bits, one more than N); only bits
// [N-1:0] are read below. Multiplication and division bind tighter than the
// shift, so the expression groups as N << ((i * i) / N).
wire [65:0] wide_small = N << i * i / N;
// The generate loop creates N separate assigns, so 'wide_small' is read N times.
for (genvar n = 0; n < N; ++n) begin
assign o[n] = i[n] ^ wide_small[n];
end
endmodule

View File

@ -13,7 +13,7 @@ test.scenarios('vlt')
test.lint(verilator_flags2=['--stats', '--expand-limit 5'])
test.file_grep(test.stats, r'Optimizations, Gate excluded wide expressions\s+(\d+)', 0)
test.file_grep(test.stats, r'Optimizations, Gate sigs deleted\s+(\d+)', 0)
test.file_grep(test.stats, r'Optimizations, Gate signals not inlined due to cost\s+(\d+)', 0)
test.file_grep(test.stats, r'Optimizations, Gate signals inlined\s+(\d+)', 0)
test.passes()