Optimize bit-scan loops into $mostsetbitp1 / $countones (#7822)
Recognize the common single-bit scan loop idioms in V3Unroll (before it
unrolls) and lower them to bit-reduction primitives, replacing a literal
W-iteration loop with one intrinsic-backed expression:
target=0; for (i=0;i<W;i++) if (vec[i]) target = i + 1; -> $mostsetbitp1(vec)
target=0; for (i=0;i<W;i++) if (vec[i]) target = target + 1; -> $countones(vec)
The leading-one form lowers to a new AstMostSetBitP1 node, emitted as
VL_MOSTSETBITP1_{I,Q,W}; those runtime helpers now use __builtin_clz where
available (same pattern as VL_REDXOR's __builtin_parity), with the existing
bit scan as fallback. The count-ones form reuses AstCountOnes ($countones,
popcount); as the DFG requires a 32-bit countones result it is built at 32
bits and narrowed to the accumulator width with a select.
Matching is structural to stay sound: the index must start at 0, increment
by exactly 1, and scan all W==width(vec) bits via a single 1-bit select of a
distinct vector, with the target pre-zeroed and no else branch. The loop
bound is accepted as a strict ascending 'idx < W' written either way and
signed or unsigned (V3Const canonicalizes it to 'W > idx', so only Gt/GtS are
matched). Gated by -fbit-scan-loops (on at -O).
Adds t_bit_scan_loops (I/Q/W, count-ones and unsigned-index positives;
step-2, start-1, idx*2+1, vec[idx+1], target=idx and W!=width negatives, all
self-checked and asserted via --stats not to lower) plus t_bit_scan_loops_off
for the disable flag.
Motivated by a transformer inference design whose 80-bit leading-one detector
ran every cycle (~37% of runtime); the lowering is worth ~39% there.
This commit is contained in:
parent
7752625f49
commit
bd6b9161dc
|
|
@ -273,6 +273,7 @@ Teng Huang
|
|||
Thomas Aldrian
|
||||
Thomas Brown
|
||||
Thomas Dybdahl Ahle
|
||||
Thomas Santerre
|
||||
Tim Hutt
|
||||
Tim Snyder
|
||||
Tobias Jensen
|
||||
|
|
|
|||
|
|
@ -662,6 +662,10 @@ Summary:
|
|||
|
||||
.. option:: -fno-assemble
|
||||
|
||||
.. option:: -fno-bit-scan-loops
|
||||
|
||||
Rarely needed. Disable converting bit counting loops into built-in operations.
|
||||
|
||||
.. option:: -fno-case
|
||||
|
||||
Rarely needed. Disable all case statement optimizations.
|
||||
|
|
|
|||
|
|
@ -903,15 +903,31 @@ static inline IData VL_CLOG2_W(int words, WDataInP const lwp) VL_PURE {
|
|||
return 0;
|
||||
}
|
||||
|
||||
// One-based position of the most-significant set bit of an IData; 0 if no bit is set
static inline IData VL_MOSTSETBITP1_I(IData lhs) VL_PURE {
    // Handle zero up front, as __builtin_clz is undefined for 0
    if (VL_UNLIKELY(lhs == 0)) return 0;
#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS)
    // Word size minus the number of leading zeros is the answer
    const int leadingZeros = __builtin_clz(lhs);
    return VL_EDATASIZE - leadingZeros;
#else
    // Portable fallback: walk down from the top bit until a set bit is found
    int bit = VL_EDATASIZE;
    while (bit-- > 0) {
        if (VL_BITISSET_E(lhs, bit)) return bit + 1;
    }
    return 0;  // LCOV_EXCL_LINE // Can't get here - one bit must be set
#endif
}
|
||||
// One-based position of the most-significant set bit of a QData; 0 if no bit is set
static inline IData VL_MOSTSETBITP1_Q(QData lhs) VL_PURE {
    // Zero has no set bit; returning here also keeps zero away from the intrinsic below
    if (VL_UNLIKELY(lhs == 0)) return 0;
#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS)
    const unsigned long long wide = static_cast<unsigned long long>(lhs);
    return 64 - __builtin_clzll(wide);
#else
    // Split into halves and defer to the IData helper; a non-zero upper half decides
    const IData upper = static_cast<IData>(lhs >> 32ULL);
    if (upper) return VL_EDATASIZE + VL_MOSTSETBITP1_I(upper);
    return VL_MOSTSETBITP1_I(static_cast<IData>(lhs));
#endif
}
|
||||
// MSB set bit plus one over a 'words'-long wide value; similar to FLS. 0=value is zero
// 'lwp[words - 1]' is treated as the most-significant word and 'lwp[0]' as the least.
static inline IData VL_MOSTSETBITP1_W(int words, WDataInP const lwp) VL_PURE {
    // Scan from the top word down; the first non-zero word contains the answer, and the
    // per-word position comes from VL_MOSTSETBITP1_I rather than a second hand-written
    // bit loop (which would make the helper call unreachable).
    for (int i = words - 1; i >= 0; --i) {
        // Shorter worst case if predict not taken
        if (VL_UNLIKELY(lwp[i])) return i * VL_EDATASIZE + VL_MOSTSETBITP1_I(lwp[i]);
    }
    return 0;
}
|
||||
|
|
|
|||
|
|
@ -5737,6 +5737,22 @@ public:
|
|||
void dump(std::ostream& str) const override;
|
||||
void dumpJson(std::ostream& str) const override;
|
||||
};
|
||||
class AstMostSetBitP1 final : public AstNodeUniop {
|
||||
// Most-significant set bit plus one (bit-width); 0 if value is zero
|
||||
public:
|
||||
AstMostSetBitP1(FileLine* fl, AstNodeExpr* lhsp)
|
||||
: ASTGEN_SUPER_MostSetBitP1(fl, lhsp) {
|
||||
dtypeSetInteger2State();
|
||||
}
|
||||
ASTGEN_MEMBERS_AstMostSetBitP1;
|
||||
void numberOperate(V3Number& out, const V3Number& lhs) override { out.opMostSetBitP1(lhs); }
|
||||
string emitVerilog() override { return "%f$mostsetbitp1(%l)"; }
|
||||
string emitC() override { return "VL_MOSTSETBITP1_%lq(%lW, %P, %li)"; }
|
||||
bool cleanOut() const override { return true; }
|
||||
bool cleanLhs() const override { return true; }
|
||||
bool sizeMattersLhs() const override { return false; }
|
||||
int instrCount() const override { return widthInstrs() * 16; }
|
||||
};
|
||||
class AstNToI final : public AstNodeUniop {
|
||||
// String to any-size integral
|
||||
public:
|
||||
|
|
|
|||
|
|
@ -1464,6 +1464,20 @@ V3Number& V3Number::opCLog2(const V3Number& lhs) {
|
|||
setZero();
|
||||
return *this;
|
||||
}
|
||||
V3Number& V3Number::opMostSetBitP1(const V3Number& lhs) {
    // Find-last-set: one more than the index of the highest 1 bit of lhs, or 0 if none
    NUM_ASSERT_OP_ARGS1(lhs);
    NUM_ASSERT_LOGIC_ARGS1(lhs);
    // A four-state operand gives an all-X result
    if (lhs.isFourState()) return setAllBitsX();
    // Walk down from the top; 'pos' ends as (index of highest 1 bit) + 1, or 0
    int pos = lhs.width();
    while (pos > 0 && !lhs.bitIs1(pos - 1)) --pos;
    if (pos > 0) {
        setLong(pos);
    } else {
        setZero();
    }
    return *this;
}
|
||||
|
||||
V3Number& V3Number::opLogNot(const V3Number& lhs) {
|
||||
NUM_ASSERT_OP_ARGS1(lhs);
|
||||
|
|
|
|||
|
|
@ -761,6 +761,7 @@ public:
|
|||
V3Number& opOneHot(const V3Number& lhs);
|
||||
V3Number& opOneHot0(const V3Number& lhs);
|
||||
V3Number& opCLog2(const V3Number& lhs);
|
||||
V3Number& opMostSetBitP1(const V3Number& lhs);
|
||||
V3Number& opClean(const V3Number& lhs, uint32_t bits);
|
||||
V3Number& opConcat(const V3Number& lhs, const V3Number& rhs);
|
||||
V3Number& opLenN(const V3Number& lhs);
|
||||
|
|
|
|||
|
|
@ -1448,6 +1448,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc,
|
|||
|
||||
DECL_OPTION("-facyc-simp", FOnOff, &m_fAcycSimp);
|
||||
DECL_OPTION("-fassemble", FOnOff, &m_fAssemble);
|
||||
DECL_OPTION("-fbit-scan-loops", FOnOff, &m_fBitScanLoops);
|
||||
DECL_OPTION("-fcase", CbFOnOff, [this](bool flag) {
|
||||
m_fCaseDecoder = flag;
|
||||
m_fCaseTable = flag;
|
||||
|
|
@ -2359,6 +2360,7 @@ void V3Options::optimize(int level) {
|
|||
const bool flag = level > 0;
|
||||
m_fAcycSimp = flag;
|
||||
m_fAssemble = flag;
|
||||
m_fBitScanLoops = flag;
|
||||
m_fCaseDecoder = flag;
|
||||
m_fCaseTable = flag;
|
||||
m_fCaseTree = flag;
|
||||
|
|
|
|||
|
|
@ -392,6 +392,7 @@ private:
|
|||
// MEMBERS (optimizations)
|
||||
bool m_fAcycSimp; // main switch: -fno-acyc-simp: acyclic pre-optimizations
|
||||
bool m_fAssemble; // main switch: -fno-assemble: assign assemble
|
||||
bool m_fBitScanLoops; // main switch: -fno-bit-scan-loops: convert bit scan loops to builtins
|
||||
bool m_fCaseDecoder; // main switch: -fno-case-decoder: case decoder conversion
|
||||
bool m_fCaseTable; // main switch: -fno-case-table: case table conversion
|
||||
bool m_fCaseTree; // main switch: -fno-case-tree: case tree conversion
|
||||
|
|
@ -731,6 +732,7 @@ public:
|
|||
// ACCESSORS (optimization options)
|
||||
bool fAcycSimp() const { return m_fAcycSimp; }
|
||||
bool fAssemble() const { return m_fAssemble; }
|
||||
bool fBitScanLoops() const { return m_fBitScanLoops; }
|
||||
bool fCaseDecoder() const { return m_fCaseDecoder; }
|
||||
bool fCaseTable() const { return m_fCaseTable; }
|
||||
bool fCaseTree() const { return m_fCaseTree; }
|
||||
|
|
|
|||
156
src/V3Unroll.cpp
156
src/V3Unroll.cpp
|
|
@ -62,6 +62,8 @@ struct UnrollStats final {
|
|||
Stat m_nPragmaDisabled{"Pragma unroll_disable"};
|
||||
Stat m_nUnrolledLoops{"Unrolled loops"};
|
||||
Stat m_nUnrolledIters{"Unrolled iterations"};
|
||||
Stat m_bitScanLowered{"Lowered priority-encoder to mostsetbitp1"};
|
||||
Stat m_countOnesLowered{"Lowered count-set-bits to countones"};
|
||||
};
|
||||
|
||||
//######################################################################
|
||||
|
|
@ -422,6 +424,157 @@ class UnrollAllVisitor final : VNVisitor {
|
|||
UnrollStats m_stats; // Statistic tracking
|
||||
UnrolllBindings m_bindings; // Variable bindings
|
||||
|
||||
// METHODS
|
||||
// Peel value-preserving width casts (Extend/ExtendS, or a low-bits Sel with lsb 0) to the
|
||||
// underlying VarRef. A Sel kept narrower than 'minWidth' is a lossy narrowing (idx[1:0])
|
||||
// and is rejected.
|
||||
static AstVarRef* unwrapToVarRef(AstNodeExpr* nodep, int minWidth) {
|
||||
while (true) {
|
||||
if (AstVarRef* const refp = VN_CAST(nodep, VarRef)) return refp;
|
||||
if (AstExtend* const ep = VN_CAST(nodep, Extend)) {
|
||||
nodep = ep->lhsp();
|
||||
} else if (AstExtendS* const ep = VN_CAST(nodep, ExtendS)) {
|
||||
nodep = ep->lhsp();
|
||||
} else if (AstSel* const sp = VN_CAST(nodep, Sel)) {
|
||||
const AstConst* const lsbp = VN_CAST(sp->lsbp(), Const);
|
||||
if (!lsbp || lsbp->toUInt() != 0 || sp->width() < minWidth) return nullptr;
|
||||
nodep = sp->fromp();
|
||||
} else {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
// True if 'nodep' is exactly '1 + var' for 'vscp' (V3Const puts the constant on the LHS).
|
||||
// Passing the add's width as minWidth rejects a lossy increment like 32'(i[1:0]) + 1.
|
||||
bool isVarPlus1(AstNode* nodep, const AstVarScope* vscp) {
|
||||
AstAdd* const addp = VN_CAST(nodep, Add);
|
||||
if (!addp || !addp->lhsp()->isOne()) return false;
|
||||
const AstVarRef* const r = unwrapToVarRef(addp->rhsp(), addp->width());
|
||||
return r && r->varScopep() == vscp;
|
||||
}
|
||||
// Resize the 32-bit reduction to the accumulator width; truncating the low bits matches
|
||||
// the original counted loop's wrap-around.
|
||||
static AstNodeExpr* resizeToWidth(AstNodeExpr* exprp, const AstVarRef* targetRefp) {
|
||||
const int width = targetRefp->width();
|
||||
if (width == 32) return exprp;
|
||||
FileLine* const flp = exprp->fileline();
|
||||
if (width < 32) return new AstSel{flp, exprp, 0, width};
|
||||
AstExtend* const extp = new AstExtend{flp, exprp};
|
||||
extp->dtypeFrom(targetRefp);
|
||||
return extp;
|
||||
}
|
||||
// Match a strict ascending loop bound 'idx < W'. V3Const canonicalizes this to the
|
||||
// 'W > idx' form (Gt unsigned, GtS signed), so only that form is matched.
|
||||
static bool ascendingBound(AstNodeExpr* condp, AstConst*& wp, AstVarRef*& idxRefp) {
|
||||
if (!VN_IS(condp, Gt) && !VN_IS(condp, GtS)) return false;
|
||||
AstNodeBiop* const bp = VN_AS(condp, NodeBiop);
|
||||
wp = VN_CAST(bp->lhsp(), Const);
|
||||
idxRefp = VN_CAST(bp->rhsp(), VarRef);
|
||||
return wp && idxRefp && !wp->num().isFourState();
|
||||
}
|
||||
// Recognize the redundant in-range guard Verilator auto-inserts for a select into a
|
||||
// non-power-of-two vector. V3Const canonicalizes 'idx <= C' to '(C >= idx)' (Gte/GteS,
|
||||
// const on the LHS), so only that form occurs; with C >= W-1 it is always true for idx
|
||||
// in 0..W-1.
|
||||
static bool isInRangeGuard(AstNodeExpr* condp, const AstVarScope* idxVscp, uint32_t width,
|
||||
int addrBits) {
|
||||
if (!VN_IS(condp, Gte) && !VN_IS(condp, GteS)) return false;
|
||||
AstNodeBiop* const bp = VN_AS(condp, NodeBiop);
|
||||
const AstConst* const cp = VN_CAST(bp->lhsp(), Const);
|
||||
if (!cp || cp->num().isFourState() || cp->toUInt() < width - 1) return false;
|
||||
const AstVarRef* const r = unwrapToVarRef(bp->rhsp(), addrBits);
|
||||
return r && r->varScopep() == idxVscp;
|
||||
}
|
||||
// Recognize a single-bit scan loop over all W bits of 'vec' (idx 0..W-1, target
// pre-zeroed) and lower it to a bit-reduction primitive. Two idioms are matched:
// target = 0; idx = 0;
// loop { looptest(W > idx); if (...vec[idx]...) target = <e>; idx = idx + 1; }
// where, when W == width(vec):
// <e> = idx + 1 => target = $mostsetbitp1(vec) (leading-one / bit-width)
// <e> = target + 1 => target = $countones(vec) (population count)
// Returns true if 'loopp' was replaced (and queued for deletion); returns false, with
// the tree untouched, as soon as any structural requirement is not met.
bool tryLowerBitScanLoop(AstLoop* loopp) {
    // The loop body must be exactly three statements: LoopTest, If, index increment
    AstLoopTest* const testp = VN_CAST(loopp->stmtsp(), LoopTest);
    if (!testp) return false;
    AstIf* const ifp = VN_CAST(testp->nextp(), If);
    if (!ifp) return false;
    AstAssign* const incp = VN_CAST(ifp->nextp(), Assign);
    if (!incp || incp->nextp()) return false;
    // Loop bound: constant W compared against the index variable
    AstConst* wp = nullptr;
    AstVarRef* idxRefp = nullptr;
    if (!ascendingBound(testp->condp(), wp, idxRefp)) return false;
    AstVarScope* const idxVscp = idxRefp->varScopep();
    const uint32_t width = wp->toUInt();
    // Bits needed to address all W bits of 'vec' (clog2(W)); a narrower index is lossy.
    const int addrBits = width <= 1 ? 1 : V3Number::log2b(width - 1) + 1;
    // The index must be bound to constant 0 on loop entry
    const AstConst* const idxInitp = m_bindings.get(idxVscp);
    if (!idxInitp || !idxInitp->isZero()) return false;
    // The trailing assignment must be 'idx = 1 + idx'
    AstVarRef* const incLhsp = VN_CAST(incp->lhsp(), VarRef);
    if (!incLhsp || incLhsp->varScopep() != idxVscp) return false;
    if (!isVarPlus1(incp->rhsp(), idxVscp)) return false;
    // The If must have no else branch and a single assignment to a variable other than idx
    if (ifp->elsesp()) return false;
    AstAssign* const thenp = VN_CAST(ifp->thensp(), Assign);
    if (!thenp || thenp->nextp()) return false;
    AstVarRef* const targetRefp = VN_CAST(thenp->lhsp(), VarRef);
    if (!targetRefp) return false;
    AstVarScope* const targetVscp = targetRefp->varScopep();
    if (targetVscp == idxVscp) return false;
    // Classify the assigned value: 'idx + 1' (leading-one) or 'target + 1' (count-ones)
    const bool isLeadingOne = isVarPlus1(thenp->rhsp(), idxVscp);
    const bool isCountOnes = !isLeadingOne && isVarPlus1(thenp->rhsp(), targetVscp);
    if (!isLeadingOne && !isCountOnes) return false;
    // If-cond is the 1-bit select 'vec[idx]', possibly wrapped in the redundant in-range
    // guard Verilator auto-inserts (as 'guard && sel') for a non-power-of-two vector:
    // '(idx <= W-1) && vec[idx]' (default / --x-assign 0; a LogAnd), or
    // '(idx <= W-1) ? vec[idx] : <x>' (--x-assign unique; a Cond).
    // The guard is always true for idx in 0..W-1, so peel it to reach the select. Any
    // other compound condition (e.g. 'vec[idx] && en') leaves a non-select, rejected below.
    AstNodeExpr* condp = ifp->condp();
    if (AstLogAnd* const andp = VN_CAST(condp, LogAnd)) {
        if (isInRangeGuard(andp->lhsp(), idxVscp, width, addrBits)) condp = andp->rhsp();
    } else if (AstCond* const ternp = VN_CAST(condp, Cond)) {
        if (isInRangeGuard(ternp->condp(), idxVscp, width, addrBits)) condp = ternp->thenp();
    }
    AstSel* const selp = VN_CAST(condp, Sel);
    if (!selp || selp->width() != 1) return false;
    // The selected vector must be a plain variable distinct from both idx and target
    const AstVarRef* const fromp = VN_CAST(selp->fromp(), VarRef);
    if (!fromp) return false;
    const AstVarScope* const fromVscp = fromp->varScopep();
    if (fromVscp == idxVscp || fromVscp == targetVscp) return false;
    AstNodeExpr* const vecExprp = selp->fromp();
    // Must scan all W bits of 'vec', indexed by exactly 'idx' (address kept >= clog2(W),
    // so a lossy narrowing like vec[idx[2:0]] is rejected).
    if (static_cast<int>(width) != vecExprp->width()) return false;
    const AstVarRef* const idxInSel = unwrapToVarRef(selp->lsbp(), addrBits);
    if (!idxInSel || idxInSel->varScopep() != idxVscp) return false;
    // 'target' must be const-0 immediately before the loop (collected in m_bindings),
    // so that an all-zero 'vec' yields 0, matching $mostsetbitp1's definition.
    const AstConst* const targetInitp = m_bindings.get(targetVscp);
    if (!targetInitp || !targetInitp->isZero()) return false;
    // Rewrite to 'target = <reduction>(vec); idx = W'. The 'idx = W' store preserves the
    // loop's exit value, so this is sound even if idx is read afterwards (else DCE drops it).
    // All checks have passed; nothing above this point modified the tree.
    FileLine* const flp = loopp->fileline();
    AstNodeExpr* reducep;
    if (isLeadingOne) {
        reducep = new AstMostSetBitP1{flp, vecExprp->cloneTree(false)};
    } else {
        AstCountOnes* const conep = new AstCountOnes{flp, vecExprp->cloneTree(false)};
        conep->dtypeSetInteger2State();
        reducep = conep;
    }
    // Match the reduction's width to the target variable's width
    reducep = resizeToWidth(reducep, targetRefp);
    AstAssign* const newp = new AstAssign{flp, targetRefp->cloneTree(false), reducep};
    newp->addNext(new AstAssign{flp, incLhsp->cloneTree(false), wp->cloneTree(false)});
    loopp->replaceWith(newp);
    VL_DO_DANGLING(pushDeletep(loopp), loopp);
    // Log the rewrite and bump the per-idiom statistic
    if (isLeadingOne) {
        UINFO(4, "Lowered priority-encoder loop to $mostsetbitp1: " << newp);
        ++m_stats.m_bitScanLowered;
    } else {
        UINFO(4, "Lowered count-set-bits loop to $countones: " << newp);
        ++m_stats.m_countOnesLowered;
    }
    return true;
}
|
||||
|
||||
// VISIT
|
||||
void visit(AstLoop* nodep) override {
|
||||
// Gather variable bindings from the preceding statements
|
||||
|
|
@ -450,6 +603,9 @@ class UnrollAllVisitor final : VNVisitor {
|
|||
m_bindings.set(lhsp->varScopep(), valp);
|
||||
}
|
||||
|
||||
// Recognize a bit counting loop and lower it to a builtin
|
||||
if (v3Global.opt.fBitScanLoops() && tryLowerBitScanLoop(nodep)) return;
|
||||
|
||||
// Attempt to unroll this loop
|
||||
const std::pair<AstNode*, bool> pair = UnrollOneVisitor::apply(m_stats, m_bindings, nodep);
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,28 @@
|
|||
#!/usr/bin/env python3
|
||||
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify it
|
||||
# under the terms of either the GNU Lesser General Public License Version 3
|
||||
# or the Perl Artistic License Version 2.0.
|
||||
# SPDX-FileCopyrightText: 2026 Wilson Snyder
|
||||
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
|
||||
|
||||
import vltest_bootstrap

test.scenarios('vlt')

# Unrolling is switched off (--unroll-count 0), so the loops have to be recognized by
# the lowering itself rather than disappearing through unrolling.
test.compile(verilator_flags2=['--stats', '--unroll-count', '0'])

# Expected --stats counters: every leading-one positive becomes $mostsetbitp1 and the
# count-ones positive becomes $countones. The negatives must stay loops, so a wrong
# lowering shows up as a higher count.
expected_lowerings = (
    (r'Optimizations, Loop unrolling, Lowered priority-encoder to mostsetbitp1\s+(\d+)', 8),
    (r'Optimizations, Loop unrolling, Lowered count-set-bits to countones\s+(\d+)', 1),
)
for stat_regex, stat_count in expected_lowerings:
    test.file_grep(test.stats, stat_regex, stat_count)

test.execute()

test.passes()
|
||||
|
|
@ -0,0 +1,169 @@
|
|||
// DESCRIPTION: Verilator: Verilog Test module
|
||||
//
|
||||
// Exercises the bit-counting loop idioms that V3Unroll lowers to builtins:
|
||||
// leading-one for (b=0;b<W;b++) if (vec[b]) n = b + 1; -> $mostsetbitp1(vec)
|
||||
// count-ones for (b=0;b<W;b++) if (vec[b]) n = n + 1; -> $countones(vec)
|
||||
// Positives must lower (counted via --stats by the .py); negatives compute a
|
||||
// different value than the builtin and so must be left as loops.
|
||||
//
|
||||
// This file ONLY is placed under the Creative Commons Public Domain.
|
||||
// SPDX-FileCopyrightText: 2026 Wilson Snyder
|
||||
// SPDX-License-Identifier: CC0-1.0
|
||||
|
||||
// verilog_format: off
|
||||
`define stop $stop
|
||||
`define checkh(gotv,expv) do if ((gotv) !== (expv)) begin $write("%%Error: %s:%0d: got=%0x exp=%0x (%s !== %s)\n", `__FILE__,`__LINE__, (gotv), (expv), `"gotv`", `"expv`"); `stop; end while(0);
|
||||
// verilog_format: on
|
||||
|
||||
// Self-checking test: each always_comb block below holds one candidate loop. The inputs
// are driven at cyc 0 and every result is compared against its expected value at cyc 1.
module t (
    input clk
);

  // ---- positives: must lower ----
  // Each input vector is paired with the result its loop computes.
  logic [31:0] p32;
  logic [5:0] n32;  // I path, narrow target (select resize)
  logic [47:0] p48;
  logic [6:0] n48;  // Q path
  logic [79:0] p80;
  logic [6:0] n80;  // W path
  logic [31:0] pu;
  logic [5:0] nu;  // unsigned loop index
  logic [31:0] p32e;
  logic [31:0] n32e;  // 32-bit target (no resize)
  logic [31:0] p32w;
  logic [39:0] n40;  // >32-bit target (extend resize)
  logic [31:0] pc;
  logic [5:0] nc;  // count-ones -> $countones
  logic [31:0] kvec;  // const (set in initial) -> exercises $mostsetbitp1 fold
  logic [5:0] kn;
  initial kvec = 32'h0000_0100;
  logic [31:0] kvec0;  // const 0 -> $mostsetbitp1(0)=0 (covers the zero path)
  logic [5:0] kn0;
  initial kvec0 = 32'h0;
  // Leading-one over 32 bits into a 6-bit target
  always_comb begin
    n32 = 0;
    for (int b = 0; b < 32; b++) if (p32[b]) n32 = 6'(b + 1);
  end
  // Leading-one over 48 bits
  always_comb begin
    n48 = 0;
    for (int b = 0; b < 48; b++) if (p48[b]) n48 = 7'(b + 1);
  end
  // Leading-one over 80 bits
  always_comb begin
    n80 = 0;
    for (int b = 0; b < 80; b++) if (p80[b]) n80 = 7'(b + 1);
  end
  // Same idiom with an unsigned loop index
  always_comb begin
    nu = 0;
    for (int unsigned b = 0; b < 32; b++) if (pu[b]) nu = 6'(b + 1);
  end
  // Target as wide as the loop index
  always_comb begin
    n32e = 0;
    for (int b = 0; b < 32; b++) if (p32e[b]) n32e = 32'(b + 1);
  end
  // Target wider than the loop index
  always_comb begin
    n40 = 0;
    for (int b = 0; b < 32; b++) if (p32w[b]) n40 = 40'(b + 1);
  end
  // Count-ones idiom: the target accumulates instead of taking idx + 1
  always_comb begin
    nc = 0;
    for (int b = 0; b < 32; b++) if (pc[b]) nc = nc + 1;
  end
  // Leading-one over a vector that is only assigned in an initial block
  always_comb begin
    kn = 0;
    for (int b = 0; b < 32; b++) if (kvec[b]) kn = 6'(b + 1);
  end
  // Leading-one over an all-zero vector
  always_comb begin
    kn0 = 0;
    for (int b = 0; b < 32; b++) if (kvec0[b]) kn0 = 6'(b + 1);
  end

  // ---- negatives: must NOT lower (each yields a different value than the builtin) ----
  logic [31:0] vn;  // shared input, bits {2,4,5,7}
  logic [31:0] vw;  // has a set bit above the scan bound
  logic [31:0] vt;  // for the truncated-index case
  logic en1;  // runtime gate for the compound-condition case
  logic [5:0] e_step2;
  logic [6:0] e_start1;
  logic [6:0] e_mul;
  logic [5:0] e_off;
  logic [5:0] e_noP1;
  logic [5:0] e_narrow;
  logic [5:0] e_comp;
  logic [5:0] e_trunc;
  // Index advances by 2, so only even bits are visited
  always_comb begin
    e_step2 = 0;
    for (int b = 0; b < 32; b += 2) if (vn[b]) e_step2 = 6'(b + 1);
  end
  // Index starts at 1, so bit 0 is never visited
  always_comb begin
    e_start1 = 0;
    for (int b = 1; b < 32; b++) if (vn[b]) e_start1 = 7'(b + 1);
  end
  // Assigned value is 2*b + 1 rather than b + 1
  always_comb begin
    e_mul = 0;
    for (int b = 0; b < 32; b++) if (vn[b]) e_mul = 7'(2 * b + 1);
  end
  // Select is offset from the index (vn[b+1])
  always_comb begin
    e_off = 0;
    for (int b = 0; b < 31; b++) if (vn[b+1]) e_off = 6'(b + 1);
  end
  // Assigned value is the index itself, without the +1
  always_comb begin
    e_noP1 = 0;
    for (int b = 0; b < 32; b++) if (vn[b]) e_noP1 = 6'(b);
  end
  // Loop bound (16) is smaller than the vector width (32)
  always_comb begin
    e_narrow = 0;
    for (int b = 0; b < 16; b++) if (vw[b]) e_narrow = 6'(b + 1);
  end
  // Condition combines the bit select with another signal
  always_comb begin
    e_comp = 0;
    for (int b = 0; b < 32; b++) if (vn[b] && en1) e_comp = 6'(b + 1);
  end
  // verilator lint_off WIDTHEXPAND
  // Select uses only the low three bits of the index
  always_comb begin
    e_trunc = 0;
    for (int b = 0; b < 32; b++) if (vt[b[2:0]]) e_trunc = 6'(b + 1);
  end
  // verilator lint_on WIDTHEXPAND

  int cyc = 0;
  always @(posedge clk) begin
    cyc <= cyc + 1;
    if (cyc == 0) begin
      // Drive all inputs; the results are checked on the following cycle
      p32 <= 32'h8000_0000;
      p48 <= 48'h0;
      p48[47] <= 1'b1;
      p80 <= 80'h0;
      p80[79] <= 1'b1;
      pu <= 32'h0001_0000;  // bit 16
      p32e <= 32'h8000_0000;
      p32w <= 32'h8000_0000;
      pc <= 32'hf0f0_f0f0;  // 16 ones
      vn <= 32'h0000_00b4;  // bits {2,4,5,7}
      vw <= 32'h0010_0008;  // bits {3,20}
      vt <= 32'h0000_0080;  // bit 7
      en1 <= 1'b0;  // gate off -> compound loop yields 0
    end
    else if (cyc == 1) begin
      `checkh(n32, 6'd32);
      `checkh(n48, 7'd48);
      `checkh(n80, 7'd80);
      `checkh(nu, 6'd17);  // unsigned-index leading-one, bit 16 -> 17
      `checkh(n32e, 32'd32);
      `checkh(n40, 40'd32);
      `checkh(nc, 6'd16);  // popcount(0xF0F0F0F0)
      `checkh(kn, 6'd9);  // mostsetbitp1(0x100), constant-folded
      `checkh(kn0, 6'd0);  // mostsetbitp1(0)=0, constant-folded (zero path)
      // negatives, hand-computed for vn = 0xB4 (bits 2,4,5,7):
      `checkh(e_step2, 6'd5);  // highest even set bit (4) + 1
      `checkh(e_start1, 7'd8);  // highest set bit in [1,32) (7) + 1
      `checkh(e_mul, 7'd15);  // 2*7 + 1
      `checkh(e_off, 6'd7);  // idx where vec[idx+1]; highest 6 -> 7
      `checkh(e_noP1, 6'd7);  // highest set bit (7), no +1
      `checkh(e_narrow, 6'd4);  // W=16 != width(vec): only low bits scanned (bit 3)
      `checkh(e_comp, 6'd0);  // && en1 (=0); a wrong lowering would give 8
      `checkh(e_trunc, 6'd32);  // vt[b[2:0]] last hits b=31; a wrong lowering would give 8
      $write("*-* All Finished *-*\n");
      $finish;
    end
  end
endmodule
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
#!/usr/bin/env python3
|
||||
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify it
|
||||
# under the terms of either the GNU Lesser General Public License Version 3
|
||||
# or the Perl Artistic License Version 2.0.
|
||||
# SPDX-FileCopyrightText: 2026 Wilson Snyder
|
||||
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
|
||||
|
||||
import vltest_bootstrap

test.scenarios('vlt')

# Reuse the same design; only the optimization switch differs.
test.top_filename = "t/t_bit_scan_loops.v"

test.compile(verilator_flags2=['--stats', '--unroll-count', '0', '-fno-bit-scan-loops'])

# With the optimization disabled, nothing lowers. Both lowering counters are checked:
# verifying only the mostsetbitp1 one would let a count-ones lowering that ignores
# -fno-bit-scan-loops go unnoticed.
test.file_grep(test.stats, r'Lowered priority-encoder to mostsetbitp1\s+(\d+)', 0)
test.file_grep(test.stats, r'Lowered count-set-bits to countones\s+(\d+)', 0)

test.execute()

test.passes()
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
#!/usr/bin/env python3
|
||||
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify it
|
||||
# under the terms of either the GNU Lesser General Public License Version 3
|
||||
# or the Perl Artistic License Version 2.0.
|
||||
# SPDX-FileCopyrightText: 2026 Wilson Snyder
|
||||
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
|
||||
|
||||
import vltest_bootstrap

test.scenarios('vlt')

# Same design as t_bit_scan_loops. With '--x-assign 0' the out-of-range guard that is
# auto-inserted on a non-power-of-two bit-select is a plain '(idx <= W-1) && vec[idx]'
# (AstLogAnd); under the driver's default '--x-assign unique' it is instead the ternary
# '(idx <= W-1) ? vec[idx] : <x>' (AstCond). This run therefore covers the matcher's other
# guard-peel branch.
test.top_filename = "t/t_bit_scan_loops.v"

compile_flags = ['--stats', '--unroll-count', '0', '--x-assign', '0']
test.compile(verilator_flags2=compile_flags)

# The lowering counts are the same as in the default run: only the shape of the guard
# changes, not the outcome.
expected_lowerings = (
    (r'Optimizations, Loop unrolling, Lowered priority-encoder to mostsetbitp1\s+(\d+)', 8),
    (r'Optimizations, Loop unrolling, Lowered count-set-bits to countones\s+(\d+)', 1),
)
for stat_regex, stat_count in expected_lowerings:
    test.file_grep(test.stats, stat_regex, stat_count)

test.execute()

test.passes()
|
||||
Loading…
Reference in New Issue