Optimize bit-scan loops into $mostsetbitp1 / $countones (#7822)

Recognize the common single-bit scan loop idioms in V3Unroll (before it
unrolls) and lower them to bit-reduction primitives, replacing a literal
W-iteration loop with one intrinsic-backed expression:

  target=0; for (i=0;i<W;i++) if (vec[i]) target = i + 1;      -> $mostsetbitp1(vec)
  target=0; for (i=0;i<W;i++) if (vec[i]) target = target + 1; -> $countones(vec)

The leading-one form lowers to a new AstMostSetBitP1 node, emitted as
VL_MOSTSETBITP1_{I,Q,W}; those runtime helpers now use __builtin_clz where
available (same pattern as VL_REDXOR's __builtin_parity), with the existing
bit scan as fallback.  The count-ones form reuses AstCountOnes ($countones,
popcount); as the DFG requires a 32-bit countones result it is built at 32
bits and narrowed to the accumulator width with a select.

Matching is structural to stay sound: the index must start at 0, increment
by exactly 1, and scan all W==width(vec) bits via a single 1-bit select of a
distinct vector, with the target pre-zeroed and no else branch.  The loop
bound must be a strict ascending 'idx < W', signed or unsigned; V3Const
canonicalizes it to 'W > idx', so the matcher only tests Gt/GtS.  Gated by
-fbit-scan-loops (on at -O).

Adds t_bit_scan_loops (I/Q/W, count-ones and unsigned-index positives;
step-2, start-1, idx*2+1, vec[idx+1], target=idx and W!=width negatives, all
self-checked and asserted via --stats not to lower) plus t_bit_scan_loops_off
for the disable flag.

Motivated by a transformer inference design whose 80-bit leading-one detector
ran every cycle (~37% of runtime); the lowering is worth ~39% there.
This commit is contained in:
Thomas Santerre 2026-06-24 05:43:05 -04:00 committed by GitHub
parent 7752625f49
commit bd6b9161dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 472 additions and 7 deletions

View File

@ -273,6 +273,7 @@ Teng Huang
Thomas Aldrian
Thomas Brown
Thomas Dybdahl Ahle
Thomas Santerre
Tim Hutt
Tim Snyder
Tobias Jensen

View File

@ -662,6 +662,10 @@ Summary:
.. option:: -fno-assemble
.. option:: -fno-bit-scan-loops
Rarely needed. Disable converting bit counting loops into built-in operations.
.. option:: -fno-case
Rarely needed. Disable all case statement optimizations.

View File

@ -903,15 +903,31 @@ static inline IData VL_CLOG2_W(int words, WDataInP const lwp) VL_PURE {
return 0;
}
static inline IData VL_MOSTSETBITP1_I(IData lhs) VL_PURE {
    // Index of the most-significant set bit plus one; 0 when no bit is set.
    // Zero is filtered out first because __builtin_clz is undefined for 0.
    if (VL_UNLIKELY(!lhs)) return 0;
#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS)
    const int leadingZeros = __builtin_clz(lhs);
    return VL_EDATASIZE - leadingZeros;
#else
    // Portable fallback: walk down from the top bit until a set bit is found
    int msbP1 = VL_EDATASIZE;
    while (msbP1 > 0) {
        const int bit = msbP1 - 1;
        if (VL_BITISSET_E(lhs, bit)) break;
        msbP1 = bit;
    }
    return msbP1;
#endif
}
static inline IData VL_MOSTSETBITP1_Q(QData lhs) VL_PURE {
    // 64-bit flavor: most-significant set bit plus one; 0 when the value is zero.
    // The zero case must not reach the builtin below.
    if (VL_UNLIKELY(!lhs)) return 0;
#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS)
    const unsigned long long wide = static_cast<unsigned long long>(lhs);
    return 64 - __builtin_clzll(wide);
#else
    // Fallback: answer from the upper 32 bits if any of them is set, else from the lower 32
    const IData upper = static_cast<IData>(lhs >> 32ULL);
    if (upper) return VL_EDATASIZE + VL_MOSTSETBITP1_I(upper);
    const IData lower = static_cast<IData>(lhs);
    return VL_MOSTSETBITP1_I(lower);
#endif
}
static inline IData VL_MOSTSETBITP1_W(int words, WDataInP const lwp) VL_PURE {
    // MSB set bit plus one; similar to FLS. 0=value is zero
    // Scan from the most-significant word down; the first non-zero word holds the answer,
    // and the per-word position is delegated to VL_MOSTSETBITP1_I (one scan, not two).
    for (int i = words - 1; i >= 0; --i) {
        // Shorter worst case if predict not taken
        if (VL_UNLIKELY(lwp[i])) return i * VL_EDATASIZE + VL_MOSTSETBITP1_I(lwp[i]);
    }
    return 0;
}

View File

@ -5737,6 +5737,22 @@ public:
void dump(std::ostream& str) const override;
void dumpJson(std::ostream& str) const override;
};
class AstMostSetBitP1 final : public AstNodeUniop {
// Most-significant set bit plus one (bit-width); 0 if value is zero
// Created by V3Unroll when it lowers a leading-one bit-scan loop
public:
AstMostSetBitP1(FileLine* fl, AstNodeExpr* lhsp)
: ASTGEN_SUPER_MostSetBitP1(fl, lhsp) {
// The result dtype is set here without consulting the operand's width
dtypeSetInteger2State();
}
ASTGEN_MEMBERS_AstMostSetBitP1;
// Constant evaluation is delegated to V3Number::opMostSetBitP1
void numberOperate(V3Number& out, const V3Number& lhs) override { out.opMostSetBitP1(lhs); }
string emitVerilog() override { return "%f$mostsetbitp1(%l)"; }
// Emits a call to the VL_MOSTSETBITP1_* runtime helpers
// NOTE(review): presumably '%lq' picks the I/Q/W variant from the operand size -- confirm
// against the C emitter's format handling
string emitC() override { return "VL_MOSTSETBITP1_%lq(%lW, %P, %li)"; }
bool cleanOut() const override { return true; }
bool cleanLhs() const override { return true; }
bool sizeMattersLhs() const override { return false; }
// Cost estimate used by the optimizer: 16 per widthInstrs() unit
int instrCount() const override { return widthInstrs() * 16; }
};
class AstNToI final : public AstNodeUniop {
// String to any-size integral
public:

View File

@ -1464,6 +1464,20 @@ V3Number& V3Number::opCLog2(const V3Number& lhs) {
setZero();
return *this;
}
V3Number& V3Number::opMostSetBitP1(const V3Number& lhs) {
    // Find-last-set: position of the highest 1 bit plus one, or 0 when no bit is 1.
    // A four-state operand produces an all-X result.
    NUM_ASSERT_OP_ARGS1(lhs);
    NUM_ASSERT_LOGIC_ARGS1(lhs);
    if (lhs.isFourState()) return setAllBitsX();
    // Walk down from the top until the bit just below 'msbP1' is a 1
    int msbP1 = lhs.width();
    while (msbP1 > 0 && !lhs.bitIs1(msbP1 - 1)) --msbP1;
    if (msbP1 > 0) {
        setLong(msbP1);
    } else {
        setZero();
    }
    return *this;
}
V3Number& V3Number::opLogNot(const V3Number& lhs) {
NUM_ASSERT_OP_ARGS1(lhs);

View File

@ -761,6 +761,7 @@ public:
V3Number& opOneHot(const V3Number& lhs);
V3Number& opOneHot0(const V3Number& lhs);
V3Number& opCLog2(const V3Number& lhs);
V3Number& opMostSetBitP1(const V3Number& lhs);
V3Number& opClean(const V3Number& lhs, uint32_t bits);
V3Number& opConcat(const V3Number& lhs, const V3Number& rhs);
V3Number& opLenN(const V3Number& lhs);

View File

@ -1448,6 +1448,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
DECL_OPTION("-facyc-simp", FOnOff, &m_fAcycSimp);
DECL_OPTION("-fassemble", FOnOff, &m_fAssemble);
DECL_OPTION("-fbit-scan-loops", FOnOff, &m_fBitScanLoops);
DECL_OPTION("-fcase", CbFOnOff, [this](bool flag) {
m_fCaseDecoder = flag;
m_fCaseTable = flag;
@ -2359,6 +2360,7 @@ void V3Options::optimize(int level) {
const bool flag = level > 0;
m_fAcycSimp = flag;
m_fAssemble = flag;
m_fBitScanLoops = flag;
m_fCaseDecoder = flag;
m_fCaseTable = flag;
m_fCaseTree = flag;

View File

@ -392,6 +392,7 @@ private:
// MEMBERS (optimizations)
bool m_fAcycSimp; // main switch: -fno-acyc-simp: acyclic pre-optimizations
bool m_fAssemble; // main switch: -fno-assemble: assign assemble
bool m_fBitScanLoops; // main switch: -fno-bit-scan-loops: convert bit scan loops to builtins
bool m_fCaseDecoder; // main switch: -fno-case-decoder: case decoder conversion
bool m_fCaseTable; // main switch: -fno-case-table: case table conversion
bool m_fCaseTree; // main switch: -fno-case-tree: case tree conversion
@ -731,6 +732,7 @@ public:
// ACCESSORS (optimization options)
bool fAcycSimp() const { return m_fAcycSimp; }
bool fAssemble() const { return m_fAssemble; }
bool fBitScanLoops() const { return m_fBitScanLoops; }
bool fCaseDecoder() const { return m_fCaseDecoder; }
bool fCaseTable() const { return m_fCaseTable; }
bool fCaseTree() const { return m_fCaseTree; }

View File

@ -62,6 +62,8 @@ struct UnrollStats final {
Stat m_nPragmaDisabled{"Pragma unroll_disable"};
Stat m_nUnrolledLoops{"Unrolled loops"};
Stat m_nUnrolledIters{"Unrolled iterations"};
Stat m_bitScanLowered{"Lowered priority-encoder to mostsetbitp1"};
Stat m_countOnesLowered{"Lowered count-set-bits to countones"};
};
//######################################################################
@ -422,6 +424,157 @@ class UnrollAllVisitor final : VNVisitor {
UnrollStats m_stats; // Statistic tracking
UnrolllBindings m_bindings; // Variable bindings
// METHODS
// Strip width casts -- Extend, ExtendS, or a Sel of the low bits (constant lsb 0) -- and
// return the VarRef underneath; nullptr if any other node kind is met on the way down.
// A low-bits Sel narrower than 'minWidth' would drop needed bits (e.g. idx[1:0]) and is
// therefore rejected as well.
// NOTE(review): an ExtendS over a Sel of exactly 'minWidth' bits would alter the value when
// that Sel's top bit is set; confirm such a shape cannot reach this helper.
static AstVarRef* unwrapToVarRef(AstNodeExpr* nodep, int minWidth) {
    AstVarRef* refp = VN_CAST(nodep, VarRef);
    while (!refp) {
        if (AstExtend* const extp = VN_CAST(nodep, Extend)) {
            nodep = extp->lhsp();
        } else if (AstExtendS* const extsp = VN_CAST(nodep, ExtendS)) {
            nodep = extsp->lhsp();
        } else if (AstSel* const selp = VN_CAST(nodep, Sel)) {
            const AstConst* const lsbp = VN_CAST(selp->lsbp(), Const);
            const bool fromBitZero = lsbp && lsbp->toUInt() == 0;
            if (!fromBitZero || selp->width() < minWidth) return nullptr;
            nodep = selp->fromp();
        } else {
            return nullptr;
        }
        refp = VN_CAST(nodep, VarRef);
    }
    return refp;
}
// True if 'nodep' is exactly '1 + var' for 'vscp' (V3Const puts the constant on the LHS).
// Passing the add's width as minWidth rejects a lossy increment like 32'(i[1:0]) + 1.
// Static like the other matcher helpers: it reads no visitor state.
static bool isVarPlus1(AstNode* nodep, const AstVarScope* vscp) {
    AstAdd* const addp = VN_CAST(nodep, Add);
    // Anything but an Add whose left operand is the constant one is not the idiom
    if (!addp || !addp->lhsp()->isOne()) return false;
    // The right operand must resolve, through width casts only, to a reference of 'vscp'
    const AstVarRef* const refp = unwrapToVarRef(addp->rhsp(), addp->width());
    return refp && refp->varScopep() == vscp;
}
// Adapt the 32-bit reduction result to the width of the accumulator 'targetRefp'.
// Narrower targets take the low bits (same wrap-around as the original counting loop);
// wider targets get an Extend typed from the target; a 32-bit target needs nothing.
static AstNodeExpr* resizeToWidth(AstNodeExpr* exprp, const AstVarRef* targetRefp) {
    const int targetWidth = targetRefp->width();
    if (targetWidth == 32) return exprp;
    FileLine* const flp = exprp->fileline();
    if (targetWidth > 32) {
        AstExtend* const widenedp = new AstExtend{flp, exprp};
        widenedp->dtypeFrom(targetRefp);
        return widenedp;
    }
    return new AstSel{flp, exprp, 0, targetWidth};
}
// Recognize the strict ascending bound 'idx < W'. After V3Const it appears as 'W > idx'
// (Gt when unsigned, GtS when signed), so that is the only shape tested.
// On a Gt/GtS node, 'wp' and 'idxRefp' receive the constant and the index reference (either
// may be null); the result is true only if both are present and the constant is two-state.
static bool ascendingBound(AstNodeExpr* condp, AstConst*& wp, AstVarRef*& idxRefp) {
    const bool isGreater = VN_IS(condp, Gt) || VN_IS(condp, GtS);
    if (!isGreater) return false;
    AstNodeBiop* const cmpp = VN_AS(condp, NodeBiop);
    wp = VN_CAST(cmpp->lhsp(), Const);
    idxRefp = VN_CAST(cmpp->rhsp(), VarRef);
    if (!wp || !idxRefp) return false;
    return !wp->num().isFourState();
}
// Detect the always-true range check Verilator adds around a select into a vector whose
// width is not a power of two. V3Const rewrites 'idx <= C' as 'C >= idx' (Gte/GteS with
// the constant on the left), so only that shape is tested; with C >= W-1 the check cannot
// fail for idx in 0..W-1.
static bool isInRangeGuard(AstNodeExpr* condp, const AstVarScope* idxVscp, uint32_t width,
                           int addrBits) {
    const bool isGte = VN_IS(condp, Gte) || VN_IS(condp, GteS);
    if (!isGte) return false;
    AstNodeBiop* const cmpp = VN_AS(condp, NodeBiop);
    const AstConst* const limitp = VN_CAST(cmpp->lhsp(), Const);
    if (!limitp) return false;
    if (limitp->num().isFourState()) return false;
    // A limit below W-1 could actually exclude in-range indices, so it is not redundant
    if (limitp->toUInt() < width - 1) return false;
    // The compared value must be the loop index itself, seen through width casts only
    const AstVarRef* const refp = unwrapToVarRef(cmpp->rhsp(), addrBits);
    if (!refp) return false;
    return refp->varScopep() == idxVscp;
}
// Recognize a single-bit scan loop over all W bits of 'vec' (idx 0..W-1, target
// pre-zeroed) and lower it to a bit-reduction primitive. Two idioms are matched:
// target = 0; idx = 0;
// loop { looptest(W > idx); if (...vec[idx]...) target = <e>; idx = idx + 1; }
// where, when W == width(vec):
// <e> = idx + 1 => target = $mostsetbitp1(vec) (leading-one / bit-width)
// <e> = target + 1 => target = $countones(vec) (population count)
// Returns true if 'loopp' was replaced (and queued for deletion); every 'return false'
// below happens before any node is created or changed, so a miss leaves the tree as-is.
bool tryLowerBitScanLoop(AstLoop* loopp) {
// The loop body must be exactly: LoopTest, If, Assign -- with nothing after the Assign
AstLoopTest* const testp = VN_CAST(loopp->stmtsp(), LoopTest);
if (!testp) return false;
AstIf* const ifp = VN_CAST(testp->nextp(), If);
if (!ifp) return false;
AstAssign* const incp = VN_CAST(ifp->nextp(), Assign);
if (!incp || incp->nextp()) return false;
// Loop bound must be 'W > idx' with a two-state constant W
AstConst* wp = nullptr;
AstVarRef* idxRefp = nullptr;
if (!ascendingBound(testp->condp(), wp, idxRefp)) return false;
AstVarScope* const idxVscp = idxRefp->varScopep();
// NOTE(review): toUInt() is applied to the bound before its width is checked; confirm it
// behaves acceptably for a bound constant wider than 32 bits (e.g. a longint index).
const uint32_t width = wp->toUInt();
// Bits needed to address all W bits of 'vec' (clog2(W)); a narrower index is lossy.
const int addrBits = width <= 1 ? 1 : V3Number::log2b(width - 1) + 1;
// The index must be bound to constant zero on entry ...
const AstConst* const idxInitp = m_bindings.get(idxVscp);
if (!idxInitp || !idxInitp->isZero()) return false;
// ... and the trailing Assign must be exactly 'idx = 1 + idx'
AstVarRef* const incLhsp = VN_CAST(incp->lhsp(), VarRef);
if (!incLhsp || incLhsp->varScopep() != idxVscp) return false;
if (!isVarPlus1(incp->rhsp(), idxVscp)) return false;
// The If has no else and a single then-statement assigning a variable other than idx
if (ifp->elsesp()) return false;
AstAssign* const thenp = VN_CAST(ifp->thensp(), Assign);
if (!thenp || thenp->nextp()) return false;
AstVarRef* const targetRefp = VN_CAST(thenp->lhsp(), VarRef);
if (!targetRefp) return false;
AstVarScope* const targetVscp = targetRefp->varScopep();
if (targetVscp == idxVscp) return false;
// Classify the assigned value: '1 + idx' (leading-one) or '1 + target' (count-ones)
const bool isLeadingOne = isVarPlus1(thenp->rhsp(), idxVscp);
const bool isCountOnes = !isLeadingOne && isVarPlus1(thenp->rhsp(), targetVscp);
if (!isLeadingOne && !isCountOnes) return false;
// If-cond is the 1-bit select 'vec[idx]', possibly wrapped in the redundant in-range
// guard Verilator auto-inserts (as 'guard && sel') for a non-power-of-two vector:
// '(idx <= W-1) && vec[idx]' (default / --x-assign 0; a LogAnd), or
// '(idx <= W-1) ? vec[idx] : <x>' (--x-assign unique; a Cond).
// The guard is always true for idx in 0..W-1, so peel it to reach the select. Any
// other compound condition (e.g. 'vec[idx] && en') leaves a non-select, rejected below.
AstNodeExpr* condp = ifp->condp();
if (AstLogAnd* const andp = VN_CAST(condp, LogAnd)) {
if (isInRangeGuard(andp->lhsp(), idxVscp, width, addrBits)) condp = andp->rhsp();
} else if (AstCond* const ternp = VN_CAST(condp, Cond)) {
if (isInRangeGuard(ternp->condp(), idxVscp, width, addrBits)) condp = ternp->thenp();
}
AstSel* const selp = VN_CAST(condp, Sel);
if (!selp || selp->width() != 1) return false;
// The scanned vector must be a plain variable distinct from both idx and target
const AstVarRef* const fromp = VN_CAST(selp->fromp(), VarRef);
if (!fromp) return false;
const AstVarScope* const fromVscp = fromp->varScopep();
if (fromVscp == idxVscp || fromVscp == targetVscp) return false;
AstNodeExpr* const vecExprp = selp->fromp();
// Must scan all W bits of 'vec', indexed by exactly 'idx' (address kept >= clog2(W),
// so a lossy narrowing like vec[idx[2:0]] is rejected).
if (static_cast<int>(width) != vecExprp->width()) return false;
const AstVarRef* const idxInSel = unwrapToVarRef(selp->lsbp(), addrBits);
if (!idxInSel || idxInSel->varScopep() != idxVscp) return false;
// 'target' must be const-0 immediately before the loop (collected in m_bindings),
// so that an all-zero 'vec' yields 0, matching $mostsetbitp1's definition.
const AstConst* const targetInitp = m_bindings.get(targetVscp);
if (!targetInitp || !targetInitp->isZero()) return false;
// Rewrite to 'target = <reduction>(vec); idx = W'. The 'idx = W' store preserves the
// loop's exit value, so this is sound even if idx is read afterwards (else DCE drops it).
FileLine* const flp = loopp->fileline();
AstNodeExpr* reducep;
if (isLeadingOne) {
reducep = new AstMostSetBitP1{flp, vecExprp->cloneTree(false)};
} else {
AstCountOnes* const conep = new AstCountOnes{flp, vecExprp->cloneTree(false)};
conep->dtypeSetInteger2State();
reducep = conep;
}
// Both reductions are built with the integer dtype; fit the result to the target's width
reducep = resizeToWidth(reducep, targetRefp);
AstAssign* const newp = new AstAssign{flp, targetRefp->cloneTree(false), reducep};
newp->addNext(new AstAssign{flp, incLhsp->cloneTree(false), wp->cloneTree(false)});
// Operands above were cloned, so the original loop subtree can be swapped out and deleted
loopp->replaceWith(newp);
VL_DO_DANGLING(pushDeletep(loopp), loopp);
// Count each idiom separately so tests can assert exactly which loops were lowered
if (isLeadingOne) {
UINFO(4, "Lowered priority-encoder loop to $mostsetbitp1: " << newp);
++m_stats.m_bitScanLowered;
} else {
UINFO(4, "Lowered count-set-bits loop to $countones: " << newp);
++m_stats.m_countOnesLowered;
}
return true;
}
// VISIT
void visit(AstLoop* nodep) override {
// Gather variable bindings from the preceding statements
@ -450,6 +603,9 @@ class UnrollAllVisitor final : VNVisitor {
m_bindings.set(lhsp->varScopep(), valp);
}
// Recognize a bit counting loop and lower it to a builtin
if (v3Global.opt.fBitScanLoops() && tryLowerBitScanLoop(nodep)) return;
// Attempt to unroll this loop
const std::pair<AstNode*, bool> pair = UnrollOneVisitor::apply(m_stats, m_bindings, nodep);

View File

@ -0,0 +1,28 @@
#!/usr/bin/env python3
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of either the GNU Lesser General Public License Version 3
# or the Perl Artistic License Version 2.0.
# SPDX-FileCopyrightText: 2026 Wilson Snyder
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
import vltest_bootstrap

test.scenarios('vlt')

# With '--unroll-count 0' the loops can only disappear by being recognized, not unrolled.
compile_flags = ['--stats', '--unroll-count', '0']
test.compile(verilator_flags2=compile_flags)

# Expected lowering counts from the stats file: every leading-one positive becomes
# $mostsetbitp1 and the single count-ones positive becomes $countones. A negative that was
# lowered by mistake would push one of these counts up.
expected_counts = (
    (r'Optimizations, Loop unrolling, Lowered priority-encoder to mostsetbitp1\s+(\d+)', 8),
    (r'Optimizations, Loop unrolling, Lowered count-set-bits to countones\s+(\d+)', 1),
)
for stat_regexp, stat_count in expected_counts:
    test.file_grep(test.stats, stat_regexp, stat_count)

test.execute()

test.passes()

View File

@ -0,0 +1,169 @@
// DESCRIPTION: Verilator: Verilog Test module
//
// Exercises the bit-counting loop idioms that V3Unroll lowers to builtins:
// leading-one for (b=0;b<W;b++) if (vec[b]) n = b + 1; -> $mostsetbitp1(vec)
// count-ones for (b=0;b<W;b++) if (vec[b]) n = n + 1; -> $countones(vec)
// Positives must lower (counted via --stats by the .py); negatives compute a
// different value than the builtin and so must be left as loops.
//
// This file ONLY is placed under the Creative Commons Public Domain.
// SPDX-FileCopyrightText: 2026 Wilson Snyder
// SPDX-License-Identifier: CC0-1.0
// verilog_format: off
`define stop $stop
`define checkh(gotv,expv) do if ((gotv) !== (expv)) begin $write("%%Error: %s:%0d: got=%0x exp=%0x (%s !== %s)\n", `__FILE__,`__LINE__, (gotv), (expv), `"gotv`", `"expv`"); `stop; end while(0);
// verilog_format: on
module t (
input clk
);
// Each always_comb below holds exactly one candidate loop; inputs are driven from the
// clocked block at the bottom and results are checked one cycle later.
// ---- positives: must lower ----
logic [31:0] p32;
logic [5:0] n32; // I path, narrow target (select resize)
logic [47:0] p48;
logic [6:0] n48; // Q path
logic [79:0] p80;
logic [6:0] n80; // W path
logic [31:0] pu;
logic [5:0] nu; // unsigned loop index
logic [31:0] p32e;
logic [31:0] n32e; // 32-bit target (no resize)
logic [31:0] p32w;
logic [39:0] n40; // >32-bit target (extend resize)
logic [31:0] pc;
logic [5:0] nc; // count-ones -> $countones
logic [31:0] kvec; // const (set in initial) -> exercises $mostsetbitp1 fold
logic [5:0] kn;
initial kvec = 32'h0000_0100;
logic [31:0] kvec0; // const 0 -> $mostsetbitp1(0)=0 (covers the zero path)
logic [5:0] kn0;
initial kvec0 = 32'h0;
always_comb begin
n32 = 0;
for (int b = 0; b < 32; b++) if (p32[b]) n32 = 6'(b + 1);
end
always_comb begin
n48 = 0;
for (int b = 0; b < 48; b++) if (p48[b]) n48 = 7'(b + 1);
end
always_comb begin
n80 = 0;
for (int b = 0; b < 80; b++) if (p80[b]) n80 = 7'(b + 1);
end
always_comb begin
nu = 0;
for (int unsigned b = 0; b < 32; b++) if (pu[b]) nu = 6'(b + 1);
end
always_comb begin
n32e = 0;
for (int b = 0; b < 32; b++) if (p32e[b]) n32e = 32'(b + 1);
end
always_comb begin
n40 = 0;
for (int b = 0; b < 32; b++) if (p32w[b]) n40 = 40'(b + 1);
end
always_comb begin
nc = 0;
for (int b = 0; b < 32; b++) if (pc[b]) nc = nc + 1;
end
always_comb begin
kn = 0;
for (int b = 0; b < 32; b++) if (kvec[b]) kn = 6'(b + 1);
end
always_comb begin
kn0 = 0;
for (int b = 0; b < 32; b++) if (kvec0[b]) kn0 = 6'(b + 1);
end
// ---- negatives: must NOT lower (each yields a different value than the builtin) ----
logic [31:0] vn; // shared input, bits {2,4,5,7}
logic [31:0] vw; // has a set bit above the scan bound
logic [31:0] vt; // for the truncated-index case
logic en1; // runtime gate for the compound-condition case
logic [5:0] e_step2;
logic [6:0] e_start1;
logic [6:0] e_mul;
logic [5:0] e_off;
logic [5:0] e_noP1;
logic [5:0] e_narrow;
logic [5:0] e_comp;
logic [5:0] e_trunc;
// Index advances by 2, not 1
always_comb begin
e_step2 = 0;
for (int b = 0; b < 32; b += 2) if (vn[b]) e_step2 = 6'(b + 1);
end
// Index starts at 1, not 0
always_comb begin
e_start1 = 0;
for (int b = 1; b < 32; b++) if (vn[b]) e_start1 = 7'(b + 1);
end
// Assigned value is 2*b + 1, neither 'idx + 1' nor 'target + 1'
always_comb begin
e_mul = 0;
for (int b = 0; b < 32; b++) if (vn[b]) e_mul = 7'(2 * b + 1);
end
// Select index is b+1 rather than the loop index itself
always_comb begin
e_off = 0;
for (int b = 0; b < 31; b++) if (vn[b+1]) e_off = 6'(b + 1);
end
// Assigned value is b without the +1
always_comb begin
e_noP1 = 0;
for (int b = 0; b < 32; b++) if (vn[b]) e_noP1 = 6'(b);
end
// Loop bound 16 does not cover the full 32-bit vector
always_comb begin
e_narrow = 0;
for (int b = 0; b < 16; b++) if (vw[b]) e_narrow = 6'(b + 1);
end
// Condition is compound: the select is AND-ed with an unrelated signal
always_comb begin
e_comp = 0;
for (int b = 0; b < 32; b++) if (vn[b] && en1) e_comp = 6'(b + 1);
end
// Select index keeps only 3 bits of b, fewer than needed to address 32 bits
// verilator lint_off WIDTHEXPAND
always_comb begin
e_trunc = 0;
for (int b = 0; b < 32; b++) if (vt[b[2:0]]) e_trunc = 6'(b + 1);
end
// verilator lint_on WIDTHEXPAND
// Stimulus and self-check: cycle 0 drives the inputs, cycle 1 compares every result
// against its hand-computed expectation and finishes.
int cyc = 0;
always @(posedge clk) begin
cyc <= cyc + 1;
if (cyc == 0) begin
p32 <= 32'h8000_0000;
p48 <= 48'h0;
p48[47] <= 1'b1;
p80 <= 80'h0;
p80[79] <= 1'b1;
pu <= 32'h0001_0000; // bit 16
p32e <= 32'h8000_0000;
p32w <= 32'h8000_0000;
pc <= 32'hf0f0_f0f0; // 16 ones
vn <= 32'h0000_00b4; // bits {2,4,5,7}
vw <= 32'h0010_0008; // bits {3,20}
vt <= 32'h0000_0080; // bit 7
en1 <= 1'b0; // gate off -> compound loop yields 0
end
else if (cyc == 1) begin
`checkh(n32, 6'd32);
`checkh(n48, 7'd48);
`checkh(n80, 7'd80);
`checkh(nu, 6'd17); // unsigned-index leading-one, bit 16 -> 17
`checkh(n32e, 32'd32);
`checkh(n40, 40'd32);
`checkh(nc, 6'd16); // popcount(0xF0F0F0F0)
`checkh(kn, 6'd9); // mostsetbitp1(0x100), constant-folded
`checkh(kn0, 6'd0); // mostsetbitp1(0)=0, constant-folded (zero path)
// negatives, hand-computed for vn = 0xB4 (bits 2,4,5,7):
`checkh(e_step2, 6'd5); // highest even set bit (4) + 1
`checkh(e_start1, 7'd8); // highest set bit in [1,32) (7) + 1
`checkh(e_mul, 7'd15); // 2*7 + 1
`checkh(e_off, 6'd7); // idx where vec[idx+1]; highest 6 -> 7
`checkh(e_noP1, 6'd7); // highest set bit (7), no +1
`checkh(e_narrow, 6'd4); // W=16 != width(vec): only low bits scanned (bit 3)
`checkh(e_comp, 6'd0); // && en1 (=0); a wrong lowering would give 8
`checkh(e_trunc, 6'd32); // vt[b[2:0]] last hits b=31; a wrong lowering would give 8
$write("*-* All Finished *-*\n");
$finish;
end
end
endmodule

View File

@ -0,0 +1,24 @@
#!/usr/bin/env python3
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of either the GNU Lesser General Public License Version 3
# or the Perl Artistic License Version 2.0.
# SPDX-FileCopyrightText: 2026 Wilson Snyder
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
import vltest_bootstrap

test.scenarios('vlt')

# Reuse the same design; only the optimization switch differs.
test.top_filename = "t/t_bit_scan_loops.v"

test.compile(verilator_flags2=['--stats', '--unroll-count', '0', '-fno-bit-scan-loops'])

# With the optimization disabled, neither idiom may lower: check both statistics, using the
# same full stat names and whole-number capture as the enabled-run tests.
test.file_grep(test.stats,
               r'Optimizations, Loop unrolling, Lowered priority-encoder to mostsetbitp1\s+(\d+)',
               0)
test.file_grep(test.stats,
               r'Optimizations, Loop unrolling, Lowered count-set-bits to countones\s+(\d+)', 0)

test.execute()

test.passes()

View File

@ -0,0 +1,32 @@
#!/usr/bin/env python3
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of either the GNU Lesser General Public License Version 3
# or the Perl Artistic License Version 2.0.
# SPDX-FileCopyrightText: 2026 Wilson Snyder
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
import vltest_bootstrap

test.scenarios('vlt')

# Same design as the main test. Under '--x-assign 0' the out-of-range guard that is
# auto-inserted around a bit-select of a non-power-of-two vector is a plain
# '(idx <= W-1) && vec[idx]' (AstLogAnd); under the driver's default '--x-assign unique' it
# is the ternary '(idx <= W-1) ? vec[idx] : <x>' (AstCond). This run therefore covers the
# matcher's other guard-peeling branch.
test.top_filename = "t/t_bit_scan_loops.v"

compile_flags = ['--stats', '--unroll-count', '0', '--x-assign', '0']
test.compile(verilator_flags2=compile_flags)

# Only the guard shape changes, so the lowering counts must equal those of the default run.
expected_counts = (
    (r'Optimizations, Loop unrolling, Lowered priority-encoder to mostsetbitp1\s+(\d+)', 8),
    (r'Optimizations, Loop unrolling, Lowered count-set-bits to countones\s+(\d+)', 1),
)
for stat_regexp, stat_count in expected_counts:
    test.file_grep(test.stats, stat_regexp, stat_count)

test.execute()

test.passes()