From bd6b9161dce2fa1a0d60a099a2f48a654350c134 Mon Sep 17 00:00:00 2001 From: Thomas Santerre Date: Wed, 24 Jun 2026 05:43:05 -0400 Subject: [PATCH] Optimize bit-scan loops into $mostsetbitp1 / $countones (#7822) Recognize the common single-bit scan loop idioms in V3Unroll (before it unrolls) and lower them to bit-reduction primitives, replacing a literal W-iteration loop with one intrinsic-backed expression: target=0; for (i=0;i<W;i++) if (vec[i]) target=i+1; => $mostsetbitp1(vec) target=0; for (i=0;i<W;i++) if (vec[i]) target=target+1; => $countones(vec) The leading-one form lowers to a new AstMostSetBitP1 node, emitted as VL_MOSTSETBITP1_{I,Q,W}; those runtime helpers now use __builtin_clz where available (same pattern as VL_REDXOR's __builtin_parity), with the existing bit scan as fallback. The count-ones form reuses AstCountOnes ($countones, popcount); as the DFG requires a 32-bit countones result it is built at 32 bits and narrowed to the accumulator width with a select. Matching is structural to stay sound: the index must start at 0, increment by exactly 1, and scan all W==width(vec) bits via a single 1-bit select of a distinct vector, with the target pre-zeroed and no else branch. The loop bound is accepted as a strict ascending 'idx < W' written either way and signed or unsigned; V3Const canonicalizes it to 'W > idx', so the matcher only needs to recognize Gt/GtS. Gated by -fbit-scan-loops (on at -O). Adds t_bit_scan_loops (I/Q/W, count-ones and unsigned-index positives; step-2, start-1, idx*2+1, vec[idx+1], target=idx and W!=width negatives, all self-checked and asserted via --stats not to lower), t_bit_scan_loops_off for the disable flag, and t_bit_scan_loops_xassign0 to cover the '--x-assign 0' form of the auto-inserted in-range guard. Motivated by a transformer inference design whose 80-bit leading-one detector ran every cycle (~37% of runtime); the lowering is worth ~39% there. 
--- docs/CONTRIBUTORS | 1 + docs/guide/exe_verilator.rst | 4 + include/verilated_funcs.h | 30 +++- src/V3AstNodeExpr.h | 16 ++ src/V3Number.cpp | 14 ++ src/V3Number.h | 1 + src/V3Options.cpp | 2 + src/V3Options.h | 2 + src/V3Unroll.cpp | 156 ++++++++++++++++++ test_regress/t/t_bit_scan_loops.py | 28 ++++ test_regress/t/t_bit_scan_loops.v | 169 ++++++++++++++++++++ test_regress/t/t_bit_scan_loops_off.py | 24 +++ test_regress/t/t_bit_scan_loops_xassign0.py | 32 ++++ 13 files changed, 472 insertions(+), 7 deletions(-) create mode 100755 test_regress/t/t_bit_scan_loops.py create mode 100644 test_regress/t/t_bit_scan_loops.v create mode 100755 test_regress/t/t_bit_scan_loops_off.py create mode 100755 test_regress/t/t_bit_scan_loops_xassign0.py diff --git a/docs/CONTRIBUTORS b/docs/CONTRIBUTORS index 44f7ecb2c..a593324c8 100644 --- a/docs/CONTRIBUTORS +++ b/docs/CONTRIBUTORS @@ -273,6 +273,7 @@ Teng Huang Thomas Aldrian Thomas Brown Thomas Dybdahl Ahle +Thomas Santerre Tim Hutt Tim Snyder Tobias Jensen diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst index 952500400..807216565 100644 --- a/docs/guide/exe_verilator.rst +++ b/docs/guide/exe_verilator.rst @@ -662,6 +662,10 @@ Summary: .. option:: -fno-assemble +.. option:: -fno-bit-scan-loops + + Rarely needed. Disable converting bit counting loops into built-in operations. + .. option:: -fno-case Rarely needed. Disable all case statement optimizations. 
diff --git a/include/verilated_funcs.h b/include/verilated_funcs.h index d69bca256..88da40387 100644 --- a/include/verilated_funcs.h +++ b/include/verilated_funcs.h @@ -903,15 +903,31 @@ static inline IData VL_CLOG2_W(int words, WDataInP const lwp) VL_PURE { return 0; } +static inline IData VL_MOSTSETBITP1_I(IData lhs) VL_PURE { + if (VL_UNLIKELY(!lhs)) return 0; // __builtin_clz is undefined for 0 +#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS) + return VL_EDATASIZE - __builtin_clz(lhs); +#else + for (int bit = VL_EDATASIZE - 1; bit >= 0; --bit) { + if (VL_BITISSET_E(lhs, bit)) return bit + 1; + } + return 0; // LCOV_EXCL_LINE // Can't get here - one bit must be set +#endif +} +static inline IData VL_MOSTSETBITP1_Q(QData lhs) VL_PURE { + if (VL_UNLIKELY(!lhs)) return 0; +#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS) + return 64 - __builtin_clzll(static_cast(lhs)); +#else + const IData hi = static_cast(lhs >> 32ULL); + return hi ? (VL_EDATASIZE + VL_MOSTSETBITP1_I(hi)) + : VL_MOSTSETBITP1_I(static_cast(lhs)); +#endif +} static inline IData VL_MOSTSETBITP1_W(int words, WDataInP const lwp) VL_PURE { - // MSB set bit plus one; similar to FLS. 
0=value is zero for (int i = words - 1; i >= 0; --i) { - if (VL_UNLIKELY(lwp[i])) { // Shorter worst case if predict not taken - for (int bit = VL_EDATASIZE - 1; bit >= 0; --bit) { - if (VL_UNLIKELY(VL_BITISSET_E(lwp[i], bit))) return i * VL_EDATASIZE + bit + 1; - } - // Can't get here - one bit must be set - } + // Shorter worst case if predict not taken + if (VL_UNLIKELY(lwp[i])) return i * VL_EDATASIZE + VL_MOSTSETBITP1_I(lwp[i]); } return 0; } diff --git a/src/V3AstNodeExpr.h b/src/V3AstNodeExpr.h index 85c5b0892..e8f065861 100644 --- a/src/V3AstNodeExpr.h +++ b/src/V3AstNodeExpr.h @@ -5737,6 +5737,22 @@ public: void dump(std::ostream& str) const override; void dumpJson(std::ostream& str) const override; }; +class AstMostSetBitP1 final : public AstNodeUniop { + // Most-significant set bit plus one (bit-width); 0 if value is zero +public: + AstMostSetBitP1(FileLine* fl, AstNodeExpr* lhsp) + : ASTGEN_SUPER_MostSetBitP1(fl, lhsp) { + dtypeSetInteger2State(); + } + ASTGEN_MEMBERS_AstMostSetBitP1; + void numberOperate(V3Number& out, const V3Number& lhs) override { out.opMostSetBitP1(lhs); } + string emitVerilog() override { return "%f$mostsetbitp1(%l)"; } + string emitC() override { return "VL_MOSTSETBITP1_%lq(%lW, %P, %li)"; } + bool cleanOut() const override { return true; } + bool cleanLhs() const override { return true; } + bool sizeMattersLhs() const override { return false; } + int instrCount() const override { return widthInstrs() * 16; } +}; class AstNToI final : public AstNodeUniop { // String to any-size integral public: diff --git a/src/V3Number.cpp b/src/V3Number.cpp index a9c505359..d97e4ed50 100644 --- a/src/V3Number.cpp +++ b/src/V3Number.cpp @@ -1464,6 +1464,20 @@ V3Number& V3Number::opCLog2(const V3Number& lhs) { setZero(); return *this; } +V3Number& V3Number::opMostSetBitP1(const V3Number& lhs) { + // Most-significant set bit plus one (bit-width / find-last-set); 0 if value is zero + NUM_ASSERT_OP_ARGS1(lhs); + NUM_ASSERT_LOGIC_ARGS1(lhs); + if 
(lhs.isFourState()) return setAllBitsX(); + for (int bit = lhs.width() - 1; bit >= 0; bit--) { + if (lhs.bitIs1(bit)) { + setLong(bit + 1); + return *this; + } + } + setZero(); + return *this; +} V3Number& V3Number::opLogNot(const V3Number& lhs) { NUM_ASSERT_OP_ARGS1(lhs); diff --git a/src/V3Number.h b/src/V3Number.h index 3791f2f19..d81e32d72 100644 --- a/src/V3Number.h +++ b/src/V3Number.h @@ -761,6 +761,7 @@ public: V3Number& opOneHot(const V3Number& lhs); V3Number& opOneHot0(const V3Number& lhs); V3Number& opCLog2(const V3Number& lhs); + V3Number& opMostSetBitP1(const V3Number& lhs); V3Number& opClean(const V3Number& lhs, uint32_t bits); V3Number& opConcat(const V3Number& lhs, const V3Number& rhs); V3Number& opLenN(const V3Number& lhs); diff --git a/src/V3Options.cpp b/src/V3Options.cpp index a23a408b2..7a85c848d 100644 --- a/src/V3Options.cpp +++ b/src/V3Options.cpp @@ -1448,6 +1448,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, DECL_OPTION("-facyc-simp", FOnOff, &m_fAcycSimp); DECL_OPTION("-fassemble", FOnOff, &m_fAssemble); + DECL_OPTION("-fbit-scan-loops", FOnOff, &m_fBitScanLoops); DECL_OPTION("-fcase", CbFOnOff, [this](bool flag) { m_fCaseDecoder = flag; m_fCaseTable = flag; @@ -2359,6 +2360,7 @@ void V3Options::optimize(int level) { const bool flag = level > 0; m_fAcycSimp = flag; m_fAssemble = flag; + m_fBitScanLoops = flag; m_fCaseDecoder = flag; m_fCaseTable = flag; m_fCaseTree = flag; diff --git a/src/V3Options.h b/src/V3Options.h index f3dd59863..182379fe9 100644 --- a/src/V3Options.h +++ b/src/V3Options.h @@ -392,6 +392,7 @@ private: // MEMBERS (optimizations) bool m_fAcycSimp; // main switch: -fno-acyc-simp: acyclic pre-optimizations bool m_fAssemble; // main switch: -fno-assemble: assign assemble + bool m_fBitScanLoops; // main switch: -fno-bit-scan-loops: convert bit scan loops to builtins bool m_fCaseDecoder; // main switch: -fno-case-decoder: case decoder conversion bool m_fCaseTable; // main switch: 
-fno-case-table: case table conversion bool m_fCaseTree; // main switch: -fno-case-tree: case tree conversion @@ -731,6 +732,7 @@ public: // ACCESSORS (optimization options) bool fAcycSimp() const { return m_fAcycSimp; } bool fAssemble() const { return m_fAssemble; } + bool fBitScanLoops() const { return m_fBitScanLoops; } bool fCaseDecoder() const { return m_fCaseDecoder; } bool fCaseTable() const { return m_fCaseTable; } bool fCaseTree() const { return m_fCaseTree; } diff --git a/src/V3Unroll.cpp b/src/V3Unroll.cpp index b5413e8b6..62cf42fa9 100644 --- a/src/V3Unroll.cpp +++ b/src/V3Unroll.cpp @@ -62,6 +62,8 @@ struct UnrollStats final { Stat m_nPragmaDisabled{"Pragma unroll_disable"}; Stat m_nUnrolledLoops{"Unrolled loops"}; Stat m_nUnrolledIters{"Unrolled iterations"}; + Stat m_bitScanLowered{"Lowered priority-encoder to mostsetbitp1"}; + Stat m_countOnesLowered{"Lowered count-set-bits to countones"}; }; //###################################################################### @@ -422,6 +424,157 @@ class UnrollAllVisitor final : VNVisitor { UnrollStats m_stats; // Statistic tracking UnrolllBindings m_bindings; // Variable bindings + // METHODS + // Peel value-preserving width casts (Extend/ExtendS, or a low-bits Sel with lsb 0) to the + // underlying VarRef. A Sel kept narrower than 'minWidth' is a lossy narrowing (idx[1:0]) + // and is rejected. 
+ static AstVarRef* unwrapToVarRef(AstNodeExpr* nodep, int minWidth) { + while (true) { + if (AstVarRef* const refp = VN_CAST(nodep, VarRef)) return refp; + if (AstExtend* const ep = VN_CAST(nodep, Extend)) { + nodep = ep->lhsp(); + } else if (AstExtendS* const ep = VN_CAST(nodep, ExtendS)) { + nodep = ep->lhsp(); + } else if (AstSel* const sp = VN_CAST(nodep, Sel)) { + const AstConst* const lsbp = VN_CAST(sp->lsbp(), Const); + if (!lsbp || lsbp->toUInt() != 0 || sp->width() < minWidth) return nullptr; + nodep = sp->fromp(); + } else { + return nullptr; + } + } + } + // True if 'nodep' is exactly '1 + var' for 'vscp' (V3Const puts the constant on the LHS). + // Passing the add's width as minWidth rejects a lossy increment like 32'(i[1:0]) + 1. + bool isVarPlus1(AstNode* nodep, const AstVarScope* vscp) { + AstAdd* const addp = VN_CAST(nodep, Add); + if (!addp || !addp->lhsp()->isOne()) return false; + const AstVarRef* const r = unwrapToVarRef(addp->rhsp(), addp->width()); + return r && r->varScopep() == vscp; + } + // Resize the 32-bit reduction to the accumulator width; truncating the low bits matches + // the original counted loop's wrap-around. + static AstNodeExpr* resizeToWidth(AstNodeExpr* exprp, const AstVarRef* targetRefp) { + const int width = targetRefp->width(); + if (width == 32) return exprp; + FileLine* const flp = exprp->fileline(); + if (width < 32) return new AstSel{flp, exprp, 0, width}; + AstExtend* const extp = new AstExtend{flp, exprp}; + extp->dtypeFrom(targetRefp); + return extp; + } + // Match a strict ascending loop bound 'idx < W'. V3Const canonicalizes this to the + // 'W > idx' form (Gt unsigned, GtS signed), so only that form is matched. 
+ static bool ascendingBound(AstNodeExpr* condp, AstConst*& wp, AstVarRef*& idxRefp) { + if (!VN_IS(condp, Gt) && !VN_IS(condp, GtS)) return false; + AstNodeBiop* const bp = VN_AS(condp, NodeBiop); + wp = VN_CAST(bp->lhsp(), Const); + idxRefp = VN_CAST(bp->rhsp(), VarRef); + return wp && idxRefp && !wp->num().isFourState(); + } + // Recognize the redundant in-range guard Verilator auto-inserts for a select into a + // non-power-of-two vector. V3Const canonicalizes 'idx <= C' to '(C >= idx)' (Gte/GteS, + // const on the LHS), so only that form occurs; with C >= W-1 it is always true for idx + // in 0..W-1. + static bool isInRangeGuard(AstNodeExpr* condp, const AstVarScope* idxVscp, uint32_t width, + int addrBits) { + if (!VN_IS(condp, Gte) && !VN_IS(condp, GteS)) return false; + AstNodeBiop* const bp = VN_AS(condp, NodeBiop); + const AstConst* const cp = VN_CAST(bp->lhsp(), Const); + if (!cp || cp->num().isFourState() || cp->toUInt() < width - 1) return false; + const AstVarRef* const r = unwrapToVarRef(bp->rhsp(), addrBits); + return r && r->varScopep() == idxVscp; + } + // Recognize a single-bit scan loop over all W bits of 'vec' (idx 0..W-1, target + // pre-zeroed) and lower it to a bit-reduction primitive. Two idioms are matched: + // target = 0; idx = 0; + // loop { looptest(W > idx); if (...vec[idx]...) 
target = ; idx = idx + 1; } + // where, when W == width(vec): + // = idx + 1 => target = $mostsetbitp1(vec) (leading-one / bit-width) + // = target + 1 => target = $countones(vec) (population count) + bool tryLowerBitScanLoop(AstLoop* loopp) { + AstLoopTest* const testp = VN_CAST(loopp->stmtsp(), LoopTest); + if (!testp) return false; + AstIf* const ifp = VN_CAST(testp->nextp(), If); + if (!ifp) return false; + AstAssign* const incp = VN_CAST(ifp->nextp(), Assign); + if (!incp || incp->nextp()) return false; + AstConst* wp = nullptr; + AstVarRef* idxRefp = nullptr; + if (!ascendingBound(testp->condp(), wp, idxRefp)) return false; + AstVarScope* const idxVscp = idxRefp->varScopep(); + const uint32_t width = wp->toUInt(); + // Bits needed to address all W bits of 'vec' (clog2(W)); a narrower index is lossy. + const int addrBits = width <= 1 ? 1 : V3Number::log2b(width - 1) + 1; + const AstConst* const idxInitp = m_bindings.get(idxVscp); + if (!idxInitp || !idxInitp->isZero()) return false; + AstVarRef* const incLhsp = VN_CAST(incp->lhsp(), VarRef); + if (!incLhsp || incLhsp->varScopep() != idxVscp) return false; + if (!isVarPlus1(incp->rhsp(), idxVscp)) return false; + if (ifp->elsesp()) return false; + AstAssign* const thenp = VN_CAST(ifp->thensp(), Assign); + if (!thenp || thenp->nextp()) return false; + AstVarRef* const targetRefp = VN_CAST(thenp->lhsp(), VarRef); + if (!targetRefp) return false; + AstVarScope* const targetVscp = targetRefp->varScopep(); + if (targetVscp == idxVscp) return false; + const bool isLeadingOne = isVarPlus1(thenp->rhsp(), idxVscp); + const bool isCountOnes = !isLeadingOne && isVarPlus1(thenp->rhsp(), targetVscp); + if (!isLeadingOne && !isCountOnes) return false; + // If-cond is the 1-bit select 'vec[idx]', possibly wrapped in the redundant in-range + // guard Verilator auto-inserts (as 'guard && sel') for a non-power-of-two vector: + // '(idx <= W-1) && vec[idx]' (default / --x-assign 0; a LogAnd), or + // '(idx <= W-1) ? 
vec[idx] : ' (--x-assign unique; a Cond). + // The guard is always true for idx in 0..W-1, so peel it to reach the select. Any + // other compound condition (e.g. 'vec[idx] && en') leaves a non-select, rejected below. + AstNodeExpr* condp = ifp->condp(); + if (AstLogAnd* const andp = VN_CAST(condp, LogAnd)) { + if (isInRangeGuard(andp->lhsp(), idxVscp, width, addrBits)) condp = andp->rhsp(); + } else if (AstCond* const ternp = VN_CAST(condp, Cond)) { + if (isInRangeGuard(ternp->condp(), idxVscp, width, addrBits)) condp = ternp->thenp(); + } + AstSel* const selp = VN_CAST(condp, Sel); + if (!selp || selp->width() != 1) return false; + const AstVarRef* const fromp = VN_CAST(selp->fromp(), VarRef); + if (!fromp) return false; + const AstVarScope* const fromVscp = fromp->varScopep(); + if (fromVscp == idxVscp || fromVscp == targetVscp) return false; + AstNodeExpr* const vecExprp = selp->fromp(); + // Must scan all W bits of 'vec', indexed by exactly 'idx' (address kept >= clog2(W), + // so a lossy narrowing like vec[idx[2:0]] is rejected). + if (static_cast(width) != vecExprp->width()) return false; + const AstVarRef* const idxInSel = unwrapToVarRef(selp->lsbp(), addrBits); + if (!idxInSel || idxInSel->varScopep() != idxVscp) return false; + // 'target' must be const-0 immediately before the loop (collected in m_bindings), + // so that an all-zero 'vec' yields 0, matching $mostsetbitp1's definition. + const AstConst* const targetInitp = m_bindings.get(targetVscp); + if (!targetInitp || !targetInitp->isZero()) return false; + // Rewrite to 'target = (vec); idx = W'. The 'idx = W' store preserves the + // loop's exit value, so this is sound even if idx is read afterwards (else DCE drops it). 
+ FileLine* const flp = loopp->fileline(); + AstNodeExpr* reducep; + if (isLeadingOne) { + reducep = new AstMostSetBitP1{flp, vecExprp->cloneTree(false)}; + } else { + AstCountOnes* const conep = new AstCountOnes{flp, vecExprp->cloneTree(false)}; + conep->dtypeSetInteger2State(); + reducep = conep; + } + reducep = resizeToWidth(reducep, targetRefp); + AstAssign* const newp = new AstAssign{flp, targetRefp->cloneTree(false), reducep}; + newp->addNext(new AstAssign{flp, incLhsp->cloneTree(false), wp->cloneTree(false)}); + loopp->replaceWith(newp); + VL_DO_DANGLING(pushDeletep(loopp), loopp); + if (isLeadingOne) { + UINFO(4, "Lowered priority-encoder loop to $mostsetbitp1: " << newp); + ++m_stats.m_bitScanLowered; + } else { + UINFO(4, "Lowered count-set-bits loop to $countones: " << newp); + ++m_stats.m_countOnesLowered; + } + return true; + } + // VISIT void visit(AstLoop* nodep) override { // Gather variable bindings from the preceding statements @@ -450,6 +603,9 @@ class UnrollAllVisitor final : VNVisitor { m_bindings.set(lhsp->varScopep(), valp); } + // Recognize a bit counting loop and lower it to a builtin + if (v3Global.opt.fBitScanLoops() && tryLowerBitScanLoop(nodep)) return; + // Attempt to unroll this loop const std::pair pair = UnrollOneVisitor::apply(m_stats, m_bindings, nodep); diff --git a/test_regress/t/t_bit_scan_loops.py b/test_regress/t/t_bit_scan_loops.py new file mode 100755 index 000000000..8d0b83c51 --- /dev/null +++ b/test_regress/t/t_bit_scan_loops.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +# DESCRIPTION: Verilator: Verilog Test driver/expect definition +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of either the GNU Lesser General Public License Version 3 +# or the Perl Artistic License Version 2.0. 
+# SPDX-FileCopyrightText: 2026 Wilson Snyder +# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 + +import vltest_bootstrap + +test.scenarios('vlt') + +# --unroll-count 0 so the loops are recognized without relying on unrolling. +test.compile(verilator_flags2=['--stats', '--unroll-count', '0']) + +# The leading-one positives lower to $mostsetbitp1, the count-ones positive to +# $countones; the negatives are left as loops (a wrong lowering would raise a count). +test.file_grep(test.stats, + r'Optimizations, Loop unrolling, Lowered priority-encoder to mostsetbitp1\s+(\d+)', + 8) +test.file_grep(test.stats, + r'Optimizations, Loop unrolling, Lowered count-set-bits to countones\s+(\d+)', + 1) + +test.execute() + +test.passes() diff --git a/test_regress/t/t_bit_scan_loops.v b/test_regress/t/t_bit_scan_loops.v new file mode 100644 index 000000000..fa2c1df00 --- /dev/null +++ b/test_regress/t/t_bit_scan_loops.v @@ -0,0 +1,169 @@ +// DESCRIPTION: Verilator: Verilog Test module +// +// Exercises the bit-counting loop idioms that V3Unroll lowers to builtins: +// leading-one for (b=0;b<W;b++) if (vec[b]) n = b+1; => $mostsetbitp1(vec) +// count-ones for (b=0;b<W;b++) if (vec[b]) n = n+1; => $countones(vec) +// Positives must lower (counted via --stats by the .py); negatives compute a +// different value than the builtin and so must be left as loops. +// +// This file ONLY is placed under the Creative Commons Public Domain. 
+// SPDX-FileCopyrightText: 2026 Wilson Snyder +// SPDX-License-Identifier: CC0-1.0 + +// verilog_format: off +`define stop $stop +`define checkh(gotv,expv) do if ((gotv) !== (expv)) begin $write("%%Error: %s:%0d: got=%0x exp=%0x (%s !== %s)\n", `__FILE__,`__LINE__, (gotv), (expv), `"gotv`", `"expv`"); `stop; end while(0); +// verilog_format: on + +module t ( + input clk +); + + // ---- positives: must lower ---- + logic [31:0] p32; + logic [5:0] n32; // I path, narrow target (select resize) + logic [47:0] p48; + logic [6:0] n48; // Q path + logic [79:0] p80; + logic [6:0] n80; // W path + logic [31:0] pu; + logic [5:0] nu; // unsigned loop index + logic [31:0] p32e; + logic [31:0] n32e; // 32-bit target (no resize) + logic [31:0] p32w; + logic [39:0] n40; // >32-bit target (extend resize) + logic [31:0] pc; + logic [5:0] nc; // count-ones -> $countones + logic [31:0] kvec; // const (set in initial) -> exercises $mostsetbitp1 fold + logic [5:0] kn; + initial kvec = 32'h0000_0100; + logic [31:0] kvec0; // const 0 -> $mostsetbitp1(0)=0 (covers the zero path) + logic [5:0] kn0; + initial kvec0 = 32'h0; + always_comb begin + n32 = 0; + for (int b = 0; b < 32; b++) if (p32[b]) n32 = 6'(b + 1); + end + always_comb begin + n48 = 0; + for (int b = 0; b < 48; b++) if (p48[b]) n48 = 7'(b + 1); + end + always_comb begin + n80 = 0; + for (int b = 0; b < 80; b++) if (p80[b]) n80 = 7'(b + 1); + end + always_comb begin + nu = 0; + for (int unsigned b = 0; b < 32; b++) if (pu[b]) nu = 6'(b + 1); + end + always_comb begin + n32e = 0; + for (int b = 0; b < 32; b++) if (p32e[b]) n32e = 32'(b + 1); + end + always_comb begin + n40 = 0; + for (int b = 0; b < 32; b++) if (p32w[b]) n40 = 40'(b + 1); + end + always_comb begin + nc = 0; + for (int b = 0; b < 32; b++) if (pc[b]) nc = nc + 1; + end + always_comb begin + kn = 0; + for (int b = 0; b < 32; b++) if (kvec[b]) kn = 6'(b + 1); + end + always_comb begin + kn0 = 0; + for (int b = 0; b < 32; b++) if (kvec0[b]) kn0 = 6'(b + 1); + end + 
+ // ---- negatives: must NOT lower (each yields a different value than the builtin) ---- + logic [31:0] vn; // shared input, bits {2,4,5,7} + logic [31:0] vw; // has a set bit above the scan bound + logic [31:0] vt; // for the truncated-index case + logic en1; // runtime gate for the compound-condition case + logic [5:0] e_step2; + logic [6:0] e_start1; + logic [6:0] e_mul; + logic [5:0] e_off; + logic [5:0] e_noP1; + logic [5:0] e_narrow; + logic [5:0] e_comp; + logic [5:0] e_trunc; + always_comb begin + e_step2 = 0; + for (int b = 0; b < 32; b += 2) if (vn[b]) e_step2 = 6'(b + 1); + end + always_comb begin + e_start1 = 0; + for (int b = 1; b < 32; b++) if (vn[b]) e_start1 = 7'(b + 1); + end + always_comb begin + e_mul = 0; + for (int b = 0; b < 32; b++) if (vn[b]) e_mul = 7'(2 * b + 1); + end + always_comb begin + e_off = 0; + for (int b = 0; b < 31; b++) if (vn[b+1]) e_off = 6'(b + 1); + end + always_comb begin + e_noP1 = 0; + for (int b = 0; b < 32; b++) if (vn[b]) e_noP1 = 6'(b); + end + always_comb begin + e_narrow = 0; + for (int b = 0; b < 16; b++) if (vw[b]) e_narrow = 6'(b + 1); + end + always_comb begin + e_comp = 0; + for (int b = 0; b < 32; b++) if (vn[b] && en1) e_comp = 6'(b + 1); + end + // verilator lint_off WIDTHEXPAND + always_comb begin + e_trunc = 0; + for (int b = 0; b < 32; b++) if (vt[b[2:0]]) e_trunc = 6'(b + 1); + end + // verilator lint_on WIDTHEXPAND + + int cyc = 0; + always @(posedge clk) begin + cyc <= cyc + 1; + if (cyc == 0) begin + p32 <= 32'h8000_0000; + p48 <= 48'h0; + p48[47] <= 1'b1; + p80 <= 80'h0; + p80[79] <= 1'b1; + pu <= 32'h0001_0000; // bit 16 + p32e <= 32'h8000_0000; + p32w <= 32'h8000_0000; + pc <= 32'hf0f0_f0f0; // 16 ones + vn <= 32'h0000_00b4; // bits {2,4,5,7} + vw <= 32'h0010_0008; // bits {3,20} + vt <= 32'h0000_0080; // bit 7 + en1 <= 1'b0; // gate off -> compound loop yields 0 + end + else if (cyc == 1) begin + `checkh(n32, 6'd32); + `checkh(n48, 7'd48); + `checkh(n80, 7'd80); + `checkh(nu, 6'd17); // 
unsigned-index leading-one, bit 16 -> 17 + `checkh(n32e, 32'd32); + `checkh(n40, 40'd32); + `checkh(nc, 6'd16); // popcount(0xF0F0F0F0) + `checkh(kn, 6'd9); // mostsetbitp1(0x100), constant-folded + `checkh(kn0, 6'd0); // mostsetbitp1(0)=0, constant-folded (zero path) + // negatives, hand-computed for vn = 0xB4 (bits 2,4,5,7): + `checkh(e_step2, 6'd5); // highest even set bit (4) + 1 + `checkh(e_start1, 7'd8); // highest set bit in [1,32) (7) + 1 + `checkh(e_mul, 7'd15); // 2*7 + 1 + `checkh(e_off, 6'd7); // idx where vec[idx+1]; highest 6 -> 7 + `checkh(e_noP1, 6'd7); // highest set bit (7), no +1 + `checkh(e_narrow, 6'd4); // W=16 != width(vec): only low bits scanned (bit 3) + `checkh(e_comp, 6'd0); // && en1 (=0); a wrong lowering would give 8 + `checkh(e_trunc, 6'd32); // vt[b[2:0]] last hits b=31; a wrong lowering would give 8 + $write("*-* All Finished *-*\n"); + $finish; + end + end +endmodule diff --git a/test_regress/t/t_bit_scan_loops_off.py b/test_regress/t/t_bit_scan_loops_off.py new file mode 100755 index 000000000..cdf34ea55 --- /dev/null +++ b/test_regress/t/t_bit_scan_loops_off.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +# DESCRIPTION: Verilator: Verilog Test driver/expect definition +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of either the GNU Lesser General Public License Version 3 +# or the Perl Artistic License Version 2.0. +# SPDX-FileCopyrightText: 2026 Wilson Snyder +# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 + +import vltest_bootstrap + +test.scenarios('vlt') + +# Reuse the same design; only the optimization switch differs. +test.top_filename = "t/t_bit_scan_loops.v" + +test.compile(verilator_flags2=['--stats', '--unroll-count', '0', '-fno-bit-scan-loops']) + +# With the optimization disabled, nothing lowers. 
+test.file_grep(test.stats, r'Lowered priority-encoder to mostsetbitp1\s+([0-9])', 0) + +test.execute() + +test.passes() diff --git a/test_regress/t/t_bit_scan_loops_xassign0.py b/test_regress/t/t_bit_scan_loops_xassign0.py new file mode 100755 index 000000000..083a30cba --- /dev/null +++ b/test_regress/t/t_bit_scan_loops_xassign0.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# DESCRIPTION: Verilator: Verilog Test driver/expect definition +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of either the GNU Lesser General Public License Version 3 +# or the Perl Artistic License Version 2.0. +# SPDX-FileCopyrightText: 2026 Wilson Snyder +# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 + +import vltest_bootstrap + +test.scenarios('vlt') + +# Reuse the same design. '--x-assign 0' makes the auto-inserted out-of-range guard on a +# non-power-of-two bit-select a plain '(idx <= W-1) && vec[idx]' (AstLogAnd), rather than +# the ternary '(idx <= W-1) ? vec[idx] : <x>' (AstCond) produced under the driver's default +# '--x-assign unique'. This exercises the matcher's other guard-peel branch. +test.top_filename = "t/t_bit_scan_loops.v" + +test.compile(verilator_flags2=['--stats', '--unroll-count', '0', '--x-assign', '0']) + +# Same lowering counts as the default run -- only the guard shape differs, not the result. +test.file_grep(test.stats, + r'Optimizations, Loop unrolling, Lowered priority-encoder to mostsetbitp1\s+(\d+)', + 8) +test.file_grep(test.stats, + r'Optimizations, Loop unrolling, Lowered count-set-bits to countones\s+(\d+)', + 1) + +test.execute() + +test.passes()