From b90fce55f418d35587383d3bc08ad7029774ffca Mon Sep 17 00:00:00 2001 From: Wilson Snyder Date: Sat, 24 Jul 2021 10:00:33 -0400 Subject: [PATCH] Includes: Refactor verilated.h and deprecate verilated_heavy.h (#2701). --- Changes | 1 + docs/guide/deprecations.rst | 5 + include/verilated.cpp | 1 - include/verilated.h | 2151 +-------------------------- include/verilated_dpi.h | 1 - include/verilated_funcs.h | 2252 +++++++++++++++++++++++++++++ include/verilated_heavy.h | 960 +----------- include/verilated_imp.h | 1 - include/verilated_save.h | 2 +- include/verilated_syms.h | 2 +- include/verilated_types.h | 897 ++++++++++++ src/V3EmitCConstPool.cpp | 2 +- src/V3EmitCHeaders.cpp | 2 +- src/V3EmitCImp.cpp | 2 +- src/V3EmitCModel.cpp | 2 +- src/V3EmitCSyms.cpp | 2 +- test_regress/t/t_verilated_all.pl | 1 + 17 files changed, 3183 insertions(+), 3101 deletions(-) create mode 100644 include/verilated_funcs.h create mode 100644 include/verilated_types.h diff --git a/Changes b/Changes index e2e7cf0da..40413b760 100644 --- a/Changes +++ b/Changes @@ -18,6 +18,7 @@ Verilator 4.211 devel in order to aid incremental compilation via ccache (#3071). [Geza Lore] * Parameter values are now emitted as 'static constexpr' instead of enum. C++ direct references to parameters might require updating (#3077). [Geza Lore] +* Refactored Verilated include files; include verilated.h not verilated_heavy.h. * Fix -G to treat simple integer literals as signed (#3060). [Anikin1610] * Fix emitted string array initializers (#2895). [Iztok Jeras] diff --git a/docs/guide/deprecations.rst b/docs/guide/deprecations.rst index aa9fcb5df..8d3124175 100644 --- a/docs/guide/deprecations.rst +++ b/docs/guide/deprecations.rst @@ -11,6 +11,11 @@ C++11 compiler support require C++14 or newer compilers for both compiling Verilator and compiling Verilated models no sooner than January 2022. +Verilated_heavy.h + The legacy "verilated_heavy.h" include was replaced with just including + "verilated.h". Verilated_heavy.h is planned for removal no sooner than + July 2022. + Configuration File -msg The :vlopt:`lint_off` "-msg" option has been replaced with the "-rule" option. "-msg" is planned for removal no sooner than January 2021. 
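To make the deprecation concrete, a minimal sketch of a user C++ driver after this change is shown below; it is illustrative only and not part of this patch. The model class "Vtop" and its "clk" port are placeholders for whatever the user's Verilated model provides. The only source change most users need is dropping the verilated_heavy.h include, since verilated.h now pulls in the type and function headers itself:

    // #include "verilated_heavy.h"  // Legacy include; deprecated by this patch
    #include "verilated.h"           // Now sufficient on its own
    #include "Vtop.h"                // Placeholder Verilated model header
    #include <memory>

    int main(int argc, char** argv) {
        const std::unique_ptr<VerilatedContext> contextp{new VerilatedContext};
        contextp->commandArgs(argc, argv);           // Pass plusargs to the runtime
        const std::unique_ptr<Vtop> topp{new Vtop{contextp.get()}};
        topp->clk = 0;                               // "clk" is an assumed port name
        while (!contextp->gotFinish()) {             // Run until $finish
            contextp->timeInc(1);                    // Advance simulated time
            topp->clk = !topp->clk;                  // Toggle the placeholder clock
            topp->eval();
        }
        topp->final();
        return 0;
    }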
diff --git a/include/verilated.cpp b/include/verilated.cpp index 375647d2d..e01f32a78 100644 --- a/include/verilated.cpp +++ b/include/verilated.cpp @@ -1347,7 +1347,6 @@ IData VL_FGETS_IXI(int obits, void* destp, IData fpi) VL_MT_SAFE { return got; } -// declared in verilated_heavy.h IData VL_FGETS_NI(std::string& dest, IData fpi) VL_MT_SAFE { return getLine(dest, fpi, std::numeric_limits::max()); } diff --git a/include/verilated.h b/include/verilated.h index f71f13308..1866e34fb 100644 --- a/include/verilated.h +++ b/include/verilated.h @@ -29,6 +29,7 @@ #ifndef VERILATOR_VERILATED_H_ #define VERILATOR_VERILATED_H_ +#define VERILATOR_VERILATED_H_INTERNAL_ // clang-format off #include "verilatedos.h" @@ -36,18 +37,22 @@ # include "verilated_sc.h" // Get SYSTEMC_VERSION and time declarations #endif +#include +#include #include #include #include #include #include #include +#include +#include #include +#include #include +#include #include // avoided to reduce compile time -// avoided and instead in verilated_heavy.h to reduce compile time -// avoided and instead in verilated_heavy.h to reduce compile time #ifdef VL_THREADED # include # include @@ -259,31 +264,7 @@ public: const char* name() const { return m_namep; } ///< Return name of module }; -//========================================================================= -// Declare nets - -#define VL_SIG8(name, msb, lsb) CData name ///< Declare signal, 1-8 bits -#define VL_SIG16(name, msb, lsb) SData name ///< Declare signal, 9-16 bits -#define VL_SIG64(name, msb, lsb) QData name ///< Declare signal, 33-64 bits -#define VL_SIG(name, msb, lsb) IData name ///< Declare signal, 17-32 bits -#define VL_SIGW(name, msb, lsb, words) WData name[words] ///< Declare signal, 65+ bits -#define VL_IN8(name, msb, lsb) CData name ///< Declare input signal, 1-8 bits -#define VL_IN16(name, msb, lsb) SData name ///< Declare input signal, 9-16 bits -#define VL_IN64(name, msb, lsb) QData name ///< Declare input signal, 33-64 bits -#define VL_IN(name, msb, lsb) IData name ///< Declare input signal, 17-32 bits -#define VL_INW(name, msb, lsb, words) WData name[words] ///< Declare input signal, 65+ bits -#define VL_INOUT8(name, msb, lsb) CData name ///< Declare bidir signal, 1-8 bits -#define VL_INOUT16(name, msb, lsb) SData name ///< Declare bidir signal, 9-16 bits -#define VL_INOUT64(name, msb, lsb) QData name ///< Declare bidir signal, 33-64 bits -#define VL_INOUT(name, msb, lsb) IData name ///< Declare bidir signal, 17-32 bits -#define VL_INOUTW(name, msb, lsb, words) WData name[words] ///< Declare bidir signal, 65+ bits -#define VL_OUT8(name, msb, lsb) CData name ///< Declare output signal, 1-8 bits -#define VL_OUT16(name, msb, lsb) SData name ///< Declare output signal, 9-16 bits -#define VL_OUT64(name, msb, lsb) QData name ///< Declare output signal, 33-64bits -#define VL_OUT(name, msb, lsb) IData name ///< Declare output signal, 17-32 bits -#define VL_OUTW(name, msb, lsb, words) WData name[words] ///< Declare output signal, 65+ bits - -///< Declare a module, ala SC_MODULE +/// Declare a module, ala SC_MODULE #define VL_MODULE(modname) class modname VL_NOT_FINAL : public VerilatedModule // Not class final in VL_MODULE, as users might be abstracting our models (--hierarchical) @@ -887,2122 +868,16 @@ inline void VerilatedContext::debug(int val) VL_MT_SAFE { Verilated::debug(val); inline int VerilatedContext::debug() VL_MT_SAFE { return Verilated::debug(); } //========================================================================= -// Extern functions -- User 
may override -- See verilated.cpp +// Data Types -/// Routine to call for $finish -/// User code may wish to replace this function, to do so, define VL_USER_FINISH. -/// This code does not have to be thread safe. -/// Verilator internal code must call VL_FINISH_MT instead, which eventually calls this. -extern void vl_finish(const char* filename, int linenum, const char* hier); - -/// Routine to call for $stop and non-fatal error -/// User code may wish to replace this function, to do so, define VL_USER_STOP. -/// This code does not have to be thread safe. -/// Verilator internal code must call VL_STOP_MT instead, which eventually calls this. -extern void vl_stop(const char* filename, int linenum, const char* hier); - -/// Routine to call for a couple of fatal messages -/// User code may wish to replace this function, to do so, define VL_USER_FATAL. -/// This code does not have to be thread safe. -/// Verilator internal code must call VL_FATAL_MT instead, which eventually calls this. -extern void vl_fatal(const char* filename, int linenum, const char* hier, const char* msg); +#include "verilated_types.h" //========================================================================= -// Extern functions -- Slow path +// Functions -/// Multithread safe wrapper for calls to $finish -extern void VL_FINISH_MT(const char* filename, int linenum, const char* hier) VL_MT_SAFE; -/// Multithread safe wrapper for calls to $stop -extern void VL_STOP_MT(const char* filename, int linenum, const char* hier, - bool maybe = true) VL_MT_SAFE; -/// Multithread safe wrapper to call for a couple of fatal messages -extern void VL_FATAL_MT(const char* filename, int linenum, const char* hier, - const char* msg) VL_MT_SAFE; - -// clang-format off -/// Print a string, multithread safe. Eventually VL_PRINTF will get called. -#ifdef VL_THREADED -extern void VL_PRINTF_MT(const char* formatp, ...) VL_ATTR_PRINTF(1) VL_MT_SAFE; -#else -# define VL_PRINTF_MT VL_PRINTF // The following parens will take care of themselves -#endif -// clang-format on - -/// Print a debug message from internals with standard prefix, with printf style format -extern void VL_DBG_MSGF(const char* formatp, ...)
VL_ATTR_PRINTF(1) VL_MT_SAFE; - -extern vluint64_t vl_rand64() VL_MT_SAFE; -inline IData VL_RANDOM_I(int obits) VL_MT_SAFE { return vl_rand64() & VL_MASK_I(obits); } -inline QData VL_RANDOM_Q(int obits) VL_MT_SAFE { return vl_rand64() & VL_MASK_Q(obits); } -#ifndef VL_NO_LEGACY -extern WDataOutP VL_RANDOM_W(int obits, WDataOutP outwp); -#endif -extern IData VL_RANDOM_SEEDED_II(int obits, IData seed) VL_MT_SAFE; -inline IData VL_URANDOM_RANGE_I(IData hi, IData lo) { - vluint64_t rnd = vl_rand64(); - if (VL_LIKELY(hi > lo)) { - // Modulus isn't very fast but it's common that hi-low is power-of-two - return (rnd % (hi - lo + 1)) + lo; - } else { - return (rnd % (lo - hi + 1)) + hi; - } -} - -// These are init time only, so slow is fine -/// Random reset a signal of given width -extern IData VL_RAND_RESET_I(int obits); -/// Random reset a signal of given width -extern QData VL_RAND_RESET_Q(int obits); -/// Random reset a signal of given width -extern WDataOutP VL_RAND_RESET_W(int obits, WDataOutP outwp); -/// Zero reset a signal (slow - else use VL_ZERO_W) -extern WDataOutP VL_ZERO_RESET_W(int obits, WDataOutP outwp); - -#if VL_THREADED -/// Return high-precision counter for profiling, or 0x0 if not available -inline QData VL_RDTSC_Q() { - vluint64_t val; - VL_RDTSC(val); - return val; -} -#endif - -extern void VL_PRINTTIMESCALE(const char* namep, const char* timeunitp, - const VerilatedContext* contextp) VL_MT_SAFE; - -extern WDataOutP _vl_moddiv_w(int lbits, WDataOutP owp, WDataInP const lwp, WDataInP const rwp, - bool is_modulus); - -extern IData VL_FGETS_IXI(int obits, void* destp, IData fpi); - -extern void VL_FFLUSH_I(IData fdi); -extern IData VL_FSEEK_I(IData fdi, IData offset, IData origin); -extern IData VL_FTELL_I(IData fdi); -extern void VL_FCLOSE_I(IData fdi); - -extern IData VL_FREAD_I(int width, int array_lsb, int array_size, void* memp, IData fpi, - IData start, IData count); - -extern void VL_WRITEF(const char* formatp, ...); -extern void VL_FWRITEF(IData fpi, const char* formatp, ...); - -extern IData VL_FSCANF_IX(IData fpi, const char* formatp, ...); -extern IData VL_SSCANF_IIX(int lbits, IData ld, const char* formatp, ...); -extern IData VL_SSCANF_IQX(int lbits, QData ld, const char* formatp, ...); -extern IData VL_SSCANF_IWX(int lbits, WDataInP const lwp, const char* formatp, ...); - -extern void VL_SFORMAT_X(int obits, CData& destr, const char* formatp, ...); -extern void VL_SFORMAT_X(int obits, SData& destr, const char* formatp, ...); -extern void VL_SFORMAT_X(int obits, IData& destr, const char* formatp, ...); -extern void VL_SFORMAT_X(int obits, QData& destr, const char* formatp, ...); -extern void VL_SFORMAT_X(int obits, void* destp, const char* formatp, ...); - -extern IData VL_SYSTEM_IW(int lhswords, WDataInP const lhsp); -extern IData VL_SYSTEM_IQ(QData lhs); -inline IData VL_SYSTEM_II(IData lhs) VL_MT_SAFE { return VL_SYSTEM_IQ(lhs); } - -extern IData VL_TESTPLUSARGS_I(const char* formatp); -extern const char* vl_mc_scan_plusargs(const char* prefixp); // PLIish - -//========================================================================= -// Base macros - -// Return true if data[bit] set; not 0/1 return, but 0/non-zero return. 
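// Illustrative usage sketch (added for clarity, not from verilated.h): typical
// use of the bit-test macros defined just below. They return 0/non-zero rather
// than 0/1, so normalize with "!= 0" where a strict boolean/0-or-1 value is
// required. The helper names here are hypothetical.
static inline bool example_bit70_set(WDataInP const sigp) VL_MT_SAFE {
    return VL_BITISSET_W(sigp, 70) != 0;  // Test bit 70 of a wide (65+ bit) signal
}
static inline bool example_flag_bit0(IData flags) VL_PURE {
    return VL_BITISSET_I(flags, 0) != 0;  // Test bit 0 of a <= 32 bit value
}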
-#define VL_BITISSET_I(data, bit) ((data) & (VL_UL(1) << VL_BITBIT_I(bit))) -#define VL_BITISSET_Q(data, bit) ((data) & (1ULL << VL_BITBIT_Q(bit))) -#define VL_BITISSET_E(data, bit) ((data) & (VL_EUL(1) << VL_BITBIT_E(bit))) -#define VL_BITISSET_W(data, bit) ((data)[VL_BITWORD_E(bit)] & (VL_EUL(1) << VL_BITBIT_E(bit))) -#define VL_BITISSETLIMIT_W(data, width, bit) (((bit) < (width)) && VL_BITISSET_W(data, bit)) - -// Shift appropriate word by bit. Does not account for wrapping between two words -#define VL_BITRSHIFT_W(data, bit) ((data)[VL_BITWORD_E(bit)] >> VL_BITBIT_E(bit)) - -// Create two 32-bit words from quadword -// WData is always at least 2 words; does not clean upper bits -#define VL_SET_WQ(owp, data) \ - do { \ - (owp)[0] = static_cast(data); \ - (owp)[1] = static_cast((data) >> VL_EDATASIZE); \ - } while (false) -#define VL_SET_WI(owp, data) \ - do { \ - (owp)[0] = static_cast(data); \ - (owp)[1] = 0; \ - } while (false) -#define VL_SET_QW(lwp) \ - ((static_cast((lwp)[0])) \ - | (static_cast((lwp)[1]) << (static_cast(VL_EDATASIZE)))) -#define VL_SET_QII(ld, rd) ((static_cast(ld) << 32ULL) | static_cast(rd)) - -// Return FILE* from IData -extern FILE* VL_CVT_I_FP(IData lhs) VL_MT_SAFE; - -// clang-format off -// Use a union to avoid cast-to-different-size warnings -// Return void* from QData -static inline void* VL_CVT_Q_VP(QData lhs) VL_PURE { - union { void* fp; QData q; } u; - u.q = lhs; - return u.fp; -} -// Return QData from const void* -static inline QData VL_CVT_VP_Q(const void* fp) VL_PURE { - union { const void* fp; QData q; } u; - u.q = 0; - u.fp = fp; - return u.q; -} -// Return double from QData (bits, not numerically) -static inline double VL_CVT_D_Q(QData lhs) VL_PURE { - union { double d; QData q; } u; - u.q = lhs; - return u.d; -} -// Return QData from double (bits, not numerically) -static inline QData VL_CVT_Q_D(double lhs) VL_PURE { - union { double d; QData q; } u; - u.d = lhs; - return u.q; -} -// clang-format on - -// Return double from lhs (numeric) unsigned -double VL_ITOR_D_W(int lbits, WDataInP const lwp) VL_PURE; -static inline double VL_ITOR_D_I(int, IData lhs) VL_PURE { - return static_cast(static_cast(lhs)); -} -static inline double VL_ITOR_D_Q(int, QData lhs) VL_PURE { - return static_cast(static_cast(lhs)); -} -// Return double from lhs (numeric) signed -double VL_ISTOR_D_W(int lbits, WDataInP const lwp) VL_PURE; -static inline double VL_ISTOR_D_I(int lbits, IData lhs) VL_PURE { - if (lbits == 32) return static_cast(static_cast(lhs)); - WData lwp[VL_WQ_WORDS_E]; - VL_SET_WI(lwp, lhs); - return VL_ISTOR_D_W(lbits, lwp); -} -static inline double VL_ISTOR_D_Q(int lbits, QData lhs) VL_PURE { - if (lbits == 64) return static_cast(static_cast(lhs)); - WData lwp[VL_WQ_WORDS_E]; - VL_SET_WQ(lwp, lhs); - return VL_ISTOR_D_W(lbits, lwp); -} -// Return QData from double (numeric) -static inline IData VL_RTOI_I_D(double lhs) VL_PURE { - return static_cast(VL_TRUNC(lhs)); -} - -// Sign extend such that if MSB set, we get ffff_ffff, else 0s -// (Requires clean input) -#define VL_SIGN_I(nbits, lhs) ((lhs) >> VL_BITBIT_I((nbits)-VL_UL(1))) -#define VL_SIGN_Q(nbits, lhs) ((lhs) >> VL_BITBIT_Q((nbits)-1ULL)) -#define VL_SIGN_E(nbits, lhs) ((lhs) >> VL_BITBIT_E((nbits)-VL_EUL(1))) -#define VL_SIGN_W(nbits, rwp) \ - ((rwp)[VL_BITWORD_E((nbits)-VL_EUL(1))] >> VL_BITBIT_E((nbits)-VL_EUL(1))) -#define VL_SIGNONES_E(nbits, lhs) (-(VL_SIGN_E(nbits, lhs))) - -// Sign bit extended up to MSB, doesn't include unsigned portion -// Optimization bug in GCC 3.3 returns different 
bitmasks to later states for -static inline IData VL_EXTENDSIGN_I(int lbits, IData lhs) VL_PURE { - return (-((lhs) & (VL_UL(1) << (lbits - 1)))); -} -static inline QData VL_EXTENDSIGN_Q(int lbits, QData lhs) VL_PURE { - return (-((lhs) & (1ULL << (lbits - 1)))); -} - -// Debugging prints -extern void _vl_debug_print_w(int lbits, WDataInP const iwp); - -//========================================================================= -// Pli macros - -extern int VL_TIME_STR_CONVERT(const char* strp) VL_PURE; - -// These are deprecated and used only to establish the default precision/units. -// Use Verilator timescale-override for better control. -// clang-format off -#ifndef VL_TIME_PRECISION -# ifdef VL_TIME_PRECISION_STR -# define VL_TIME_PRECISION VL_TIME_STR_CONVERT(VL_STRINGIFY(VL_TIME_PRECISION_STR)) -# else -# define VL_TIME_PRECISION (-12) ///< Timescale default units if not in Verilog - picoseconds -# endif -#endif -#ifndef VL_TIME_UNIT -# ifdef VL_TIME_UNIT_STR -# define VL_TIME_UNIT VL_TIME_STR_CONVERT(VL_STRINGIFY(VL_TIME_PRECISION_STR)) -# else -# define VL_TIME_UNIT (-12) ///< Timescale default units if not in Verilog - picoseconds -# endif -#endif - -#if defined(SYSTEMC_VERSION) -/// Return current simulation time -// Already defined: extern sc_time sc_time_stamp(); -inline vluint64_t vl_time_stamp64() { return sc_time_stamp().value(); } -#else // Non-SystemC -# if !defined(VL_TIME_CONTEXT) && !defined(VL_NO_LEGACY) -# ifdef VL_TIME_STAMP64 -// vl_time_stamp64() may be optionally defined by the user to return time. -// On MSVC++ weak symbols are not supported so must be declared, or define -// VL_TIME_CONTEXT. -extern vluint64_t vl_time_stamp64() VL_ATTR_WEAK; -# else -// sc_time_stamp() may be optionally defined by the user to return time. -// On MSVC++ weak symbols are not supported so must be declared, or define -// VL_TIME_CONTEXT. -extern double sc_time_stamp() VL_ATTR_WEAK; // Verilator 4.032 and newer -inline vluint64_t vl_time_stamp64() { - // clang9.0.1 requires & although we really do want the weak symbol value - return VL_LIKELY(&sc_time_stamp) ? static_cast(sc_time_stamp()) : 0; -} -# endif -# endif -#endif - -inline vluint64_t VerilatedContext::time() const VL_MT_SAFE { - // When using non-default context, fastest path is return time - if (VL_LIKELY(m_s.m_time)) return m_s.m_time; -#if defined(SYSTEMC_VERSION) || (!defined(VL_TIME_CONTEXT) && !defined(VL_NO_LEGACY)) - // Zero time could mean really at zero, or using callback - // clang9.0.1 requires & although we really do want the weak symbol value - if (VL_LIKELY(&vl_time_stamp64)) { // else is weak symbol that is not defined - return vl_time_stamp64(); - } -#endif - return 0; -} - -#define VL_TIME_Q() (Verilated::threadContextp()->time()) -#define VL_TIME_D() (static_cast(VL_TIME_Q())) - -// Time scaled from 1-per-precision into a module's time units ("Unit"-ed, not "United") -// Optimized assuming scale is always constant. -// Can't use multiply in Q flavor, as might lose precision -#define VL_TIME_UNITED_Q(scale) (VL_TIME_Q() / static_cast(scale)) -#define VL_TIME_UNITED_D(scale) (VL_TIME_D() / static_cast(scale)) - -// Return time precision as multiplier of time units -double vl_time_multiplier(int scale) VL_PURE; -// Return power of 10. e.g. 
returns 100 if n==2 -vluint64_t vl_time_pow10(int n) VL_PURE; - -#ifdef VL_DEBUG -/// Evaluate statement if Verilated::debug() enabled -# define VL_DEBUG_IF(stmt) \ - do { \ - if (VL_UNLIKELY(Verilated::debug())) {stmt} \ - } while (false) -#else -// We intentionally do not compile the stmt to improve compile speed -# define VL_DEBUG_IF(stmt) do {} while (false) -#endif - -// clang-format on - -//========================================================================= -// Functional macros/routines -// These all take the form -// VL_func_IW(bits, bits, op, op) -// VL_func_WW(bits, bits, out, op, op) -// The I/W indicates if it's a integer or wide for the output and each operand. -// The bits indicate the bit width of the output and each operand. -// If wide output, a temporary storage location is specified. - -//=================================================================== -// SETTING OPERATORS - -// Output clean -// EMIT_RULE: VL_CLEAN: oclean=clean; obits=lbits; -#define VL_CLEAN_II(obits, lbits, lhs) ((lhs)&VL_MASK_I(obits)) -#define VL_CLEAN_QQ(obits, lbits, lhs) ((lhs)&VL_MASK_Q(obits)) - -// EMIT_RULE: VL_ASSIGNCLEAN: oclean=clean; obits==lbits; -#define VL_ASSIGNCLEAN_W(obits, owp, lwp) VL_CLEAN_WW((obits), (obits), (owp), (lwp)) -static inline WDataOutP _vl_clean_inplace_w(int obits, WDataOutP owp) VL_MT_SAFE { - const int words = VL_WORDS_I(obits); - owp[words - 1] &= VL_MASK_E(obits); - return owp; -} -static inline WDataOutP VL_CLEAN_WW(int obits, int, WDataOutP owp, WDataInP const lwp) VL_MT_SAFE { - const int words = VL_WORDS_I(obits); - for (int i = 0; (i < (words - 1)); ++i) owp[i] = lwp[i]; - owp[words - 1] = lwp[words - 1] & VL_MASK_E(obits); - return owp; -} -static inline WDataOutP VL_ZERO_W(int obits, WDataOutP owp) VL_MT_SAFE { - const int words = VL_WORDS_I(obits); - for (int i = 0; i < words; ++i) owp[i] = 0; - return owp; -} -static inline WDataOutP VL_ALLONES_W(int obits, WDataOutP owp) VL_MT_SAFE { - const int words = VL_WORDS_I(obits); - for (int i = 0; i < (words - 1); ++i) owp[i] = ~VL_EUL(0); - owp[words - 1] = VL_MASK_E(obits); - return owp; -} - -// EMIT_RULE: VL_ASSIGN: oclean=rclean; obits==lbits; -// For now, we always have a clean rhs. -// Note: If a ASSIGN isn't clean, use VL_ASSIGNCLEAN instead to do the same thing. -static inline WDataOutP VL_ASSIGN_W(int obits, WDataOutP owp, WDataInP const lwp) VL_MT_SAFE { - const int words = VL_WORDS_I(obits); - for (int i = 0; i < words; ++i) owp[i] = lwp[i]; - return owp; -} - -// EMIT_RULE: VL_ASSIGNBIT: rclean=clean; -static inline void VL_ASSIGNBIT_II(int, int bit, CData& lhsr, IData rhs) VL_PURE { - lhsr = ((lhsr & ~(VL_UL(1) << VL_BITBIT_I(bit))) | (rhs << VL_BITBIT_I(bit))); -} -static inline void VL_ASSIGNBIT_II(int, int bit, SData& lhsr, IData rhs) VL_PURE { - lhsr = ((lhsr & ~(VL_UL(1) << VL_BITBIT_I(bit))) | (rhs << VL_BITBIT_I(bit))); -} -static inline void VL_ASSIGNBIT_II(int, int bit, IData& lhsr, IData rhs) VL_PURE { - lhsr = ((lhsr & ~(VL_UL(1) << VL_BITBIT_I(bit))) | (rhs << VL_BITBIT_I(bit))); -} -static inline void VL_ASSIGNBIT_QI(int, int bit, QData& lhsr, QData rhs) VL_PURE { - lhsr = ((lhsr & ~(1ULL << VL_BITBIT_Q(bit))) | (static_cast(rhs) << VL_BITBIT_Q(bit))); -} -static inline void VL_ASSIGNBIT_WI(int, int bit, WDataOutP owp, IData rhs) VL_MT_SAFE { - EData orig = owp[VL_BITWORD_E(bit)]; - owp[VL_BITWORD_E(bit)] = ((orig & ~(VL_EUL(1) << VL_BITBIT_E(bit))) - | (static_cast(rhs) << VL_BITBIT_E(bit))); -} -// Alternative form that is an instruction faster when rhs is constant one. 
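// Illustrative sketch (added for clarity, not from verilated.h): a bit-select
// assignment such as "sig[3] = val;" roughly corresponds to the general forms
// above, while a constant-one right-hand side such as "sig[3] = 1'b1;" can use
// the _*O forms below, which skip the read-mask-merge of the old bit. The
// helper name is hypothetical.
static inline void example_assignbit(IData& sigr, IData val) {
    VL_ASSIGNBIT_II(32, 3, sigr, val & 1);  // General form: insert (clean) val into bit 3
    VL_ASSIGNBIT_IO(32, 3, sigr, 1);        // Constant-one form: just OR in bit 3
}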
-static inline void VL_ASSIGNBIT_IO(int, int bit, CData& lhsr, IData) VL_PURE { - lhsr = (lhsr | (VL_UL(1) << VL_BITBIT_I(bit))); -} -static inline void VL_ASSIGNBIT_IO(int, int bit, SData& lhsr, IData) VL_PURE { - lhsr = (lhsr | (VL_UL(1) << VL_BITBIT_I(bit))); -} -static inline void VL_ASSIGNBIT_IO(int, int bit, IData& lhsr, IData) VL_PURE { - lhsr = (lhsr | (VL_UL(1) << VL_BITBIT_I(bit))); -} -static inline void VL_ASSIGNBIT_QO(int, int bit, QData& lhsr, IData) VL_PURE { - lhsr = (lhsr | (1ULL << VL_BITBIT_Q(bit))); -} -static inline void VL_ASSIGNBIT_WO(int, int bit, WDataOutP owp, IData) VL_MT_SAFE { - const EData orig = owp[VL_BITWORD_E(bit)]; - owp[VL_BITWORD_E(bit)] = (orig | (VL_EUL(1) << VL_BITBIT_E(bit))); -} - -//=================================================================== -// SYSTEMC OPERATORS -// Copying verilog format to systemc integers and bit vectors. -// Get a SystemC variable - -#define VL_ASSIGN_ISI(obits, vvar, svar) \ - { (vvar) = VL_CLEAN_II((obits), (obits), (svar).read()); } -#define VL_ASSIGN_QSQ(obits, vvar, svar) \ - { (vvar) = VL_CLEAN_QQ((obits), (obits), (svar).read()); } - -#define VL_ASSIGN_ISW(obits, od, svar) \ - { (od) = ((svar).read().get_word(0)) & VL_MASK_I(obits); } -#define VL_ASSIGN_QSW(obits, od, svar) \ - { \ - (od) = ((static_cast((svar).read().get_word(1))) << VL_IDATASIZE \ - | (svar).read().get_word(0)) \ - & VL_MASK_Q(obits); \ - } -#define VL_ASSIGN_WSW(obits, owp, svar) \ - { \ - const int words = VL_WORDS_I(obits); \ - for (int i = 0; i < words; ++i) (owp)[i] = (svar).read().get_word(i); \ - (owp)[words - 1] &= VL_MASK_E(obits); \ - } - -#define VL_ASSIGN_ISU(obits, vvar, svar) \ - { (vvar) = VL_CLEAN_II((obits), (obits), (svar).read().to_uint()); } -#define VL_ASSIGN_QSU(obits, vvar, svar) \ - { (vvar) = VL_CLEAN_QQ((obits), (obits), (svar).read().to_uint64()); } -#define VL_ASSIGN_WSB(obits, owp, svar) \ - { \ - const int words = VL_WORDS_I(obits); \ - sc_biguint<(obits)> _butemp = (svar).read(); \ - for (int i = 0; i < words; ++i) { \ - int msb = ((i + 1) * VL_IDATASIZE) - 1; \ - msb = (msb >= (obits)) ? ((obits)-1) : msb; \ - (owp)[i] = _butemp.range(msb, i * VL_IDATASIZE).to_uint(); \ - } \ - (owp)[words - 1] &= VL_MASK_E(obits); \ - } - -// Copying verilog format from systemc integers and bit vectors. -// Set a SystemC variable - -#define VL_ASSIGN_SII(obits, svar, vvar) \ - { (svar).write(vvar); } -#define VL_ASSIGN_SQQ(obits, svar, vvar) \ - { (svar).write(vvar); } - -#define VL_ASSIGN_SWI(obits, svar, rd) \ - { \ - sc_bv<(obits)> _bvtemp; \ - _bvtemp.set_word(0, (rd)); \ - (svar).write(_bvtemp); \ - } -#define VL_ASSIGN_SWQ(obits, svar, rd) \ - { \ - sc_bv<(obits)> _bvtemp; \ - _bvtemp.set_word(0, static_cast(rd)); \ - _bvtemp.set_word(1, static_cast((rd) >> VL_IDATASIZE)); \ - (svar).write(_bvtemp); \ - } -#define VL_ASSIGN_SWW(obits, svar, rwp) \ - { \ - sc_bv<(obits)> _bvtemp; \ - for (int i = 0; i < VL_WORDS_I(obits); ++i) _bvtemp.set_word(i, (rwp)[i]); \ - (svar).write(_bvtemp); \ - } - -#define VL_ASSIGN_SUI(obits, svar, rd) \ - { (svar).write(rd); } -#define VL_ASSIGN_SUQ(obits, svar, rd) \ - { (svar).write(rd); } -#define VL_ASSIGN_SBI(obits, svar, rd) \ - { (svar).write(rd); } -#define VL_ASSIGN_SBQ(obits, svar, rd) \ - { (svar).write(rd); } -#define VL_ASSIGN_SBW(obits, svar, rwp) \ - { \ - sc_biguint<(obits)> _butemp; \ - for (int i = 0; i < VL_WORDS_I(obits); ++i) { \ - int msb = ((i + 1) * VL_IDATASIZE) - 1; \ - msb = (msb >= (obits)) ? 
((obits)-1) : msb; \ - _butemp.range(msb, i* VL_IDATASIZE) = (rwp)[i]; \ - } \ - (svar).write(_butemp); \ - } - -//=================================================================== -// Extending sizes - -// CAREFUL, we're width changing, so obits!=lbits - -// Right must be clean because otherwise size increase would pick up bad bits -// EMIT_RULE: VL_EXTEND: oclean=clean; rclean==clean; -#define VL_EXTEND_II(obits, lbits, lhs) ((lhs)) -#define VL_EXTEND_QI(obits, lbits, lhs) (static_cast(lhs)) -#define VL_EXTEND_QQ(obits, lbits, lhs) ((lhs)) - -static inline WDataOutP VL_EXTEND_WI(int obits, int, WDataOutP owp, IData ld) VL_MT_SAFE { - // Note for extracts that obits != lbits - owp[0] = ld; - for (int i = 1; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - return owp; -} -static inline WDataOutP VL_EXTEND_WQ(int obits, int, WDataOutP owp, QData ld) VL_MT_SAFE { - VL_SET_WQ(owp, ld); - for (int i = VL_WQ_WORDS_E; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - return owp; -} -static inline WDataOutP VL_EXTEND_WW(int obits, int lbits, WDataOutP owp, - WDataInP const lwp) VL_MT_SAFE { - for (int i = 0; i < VL_WORDS_I(lbits); ++i) owp[i] = lwp[i]; - for (int i = VL_WORDS_I(lbits); i < VL_WORDS_I(obits); ++i) owp[i] = 0; - return owp; -} - -// EMIT_RULE: VL_EXTENDS: oclean=*dirty*; obits=lbits; -// Sign extension; output dirty -static inline IData VL_EXTENDS_II(int, int lbits, IData lhs) VL_PURE { - return VL_EXTENDSIGN_I(lbits, lhs) | lhs; -} -static inline QData VL_EXTENDS_QI(int, int lbits, QData lhs /*Q_as_need_extended*/) VL_PURE { - return VL_EXTENDSIGN_Q(lbits, lhs) | lhs; -} -static inline QData VL_EXTENDS_QQ(int, int lbits, QData lhs) VL_PURE { - return VL_EXTENDSIGN_Q(lbits, lhs) | lhs; -} - -static inline WDataOutP VL_EXTENDS_WI(int obits, int lbits, WDataOutP owp, IData ld) VL_MT_SAFE { - const EData sign = VL_SIGNONES_E(lbits, static_cast(ld)); - owp[0] = ld | (sign & ~VL_MASK_E(lbits)); - for (int i = 1; i < VL_WORDS_I(obits); ++i) owp[i] = sign; - return owp; -} -static inline WDataOutP VL_EXTENDS_WQ(int obits, int lbits, WDataOutP owp, QData ld) VL_MT_SAFE { - VL_SET_WQ(owp, ld); - const EData sign = VL_SIGNONES_E(lbits, owp[1]); - owp[1] |= sign & ~VL_MASK_E(lbits); - for (int i = VL_WQ_WORDS_E; i < VL_WORDS_I(obits); ++i) owp[i] = sign; - return owp; -} -static inline WDataOutP VL_EXTENDS_WW(int obits, int lbits, WDataOutP owp, - WDataInP const lwp) VL_MT_SAFE { - for (int i = 0; i < VL_WORDS_I(lbits) - 1; ++i) owp[i] = lwp[i]; - const int lmsw = VL_WORDS_I(lbits) - 1; - const EData sign = VL_SIGNONES_E(lbits, lwp[lmsw]); - owp[lmsw] = lwp[lmsw] | (sign & ~VL_MASK_E(lbits)); - for (int i = VL_WORDS_I(lbits); i < VL_WORDS_I(obits); ++i) owp[i] = sign; - return owp; -} - -//=================================================================== -// REDUCTION OPERATORS - -// EMIT_RULE: VL_REDAND: oclean=clean; lclean==clean; obits=1; -#define VL_REDAND_II(obits, lbits, lhs) ((lhs) == VL_MASK_I(lbits)) -#define VL_REDAND_IQ(obits, lbits, lhs) ((lhs) == VL_MASK_Q(lbits)) -static inline IData VL_REDAND_IW(int, int lbits, WDataInP const lwp) VL_MT_SAFE { - const int words = VL_WORDS_I(lbits); - EData combine = lwp[0]; - for (int i = 1; i < words - 1; ++i) combine &= lwp[i]; - combine &= ~VL_MASK_E(lbits) | lwp[words - 1]; - return ((~combine) == 0); -} - -// EMIT_RULE: VL_REDOR: oclean=clean; lclean==clean; obits=1; -#define VL_REDOR_I(lhs) ((lhs) != 0) -#define VL_REDOR_Q(lhs) ((lhs) != 0) -static inline IData VL_REDOR_W(int words, WDataInP const lwp) VL_MT_SAFE { - EData equal = 0; - for (int i = 
0; i < words; ++i) equal |= lwp[i]; - return (equal != 0); -} - -// EMIT_RULE: VL_REDXOR: oclean=dirty; obits=1; -static inline IData VL_REDXOR_2(IData r) VL_PURE { - // Experiments show VL_REDXOR_2 is faster than __builtin_parityl - r = (r ^ (r >> 1)); - return r; -} -static inline IData VL_REDXOR_4(IData r) VL_PURE { -#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS) - return __builtin_parityl(r); -#else - r = (r ^ (r >> 1)); - r = (r ^ (r >> 2)); - return r; -#endif -} -static inline IData VL_REDXOR_8(IData r) VL_PURE { -#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS) - return __builtin_parityl(r); -#else - r = (r ^ (r >> 1)); - r = (r ^ (r >> 2)); - r = (r ^ (r >> 4)); - return r; -#endif -} -static inline IData VL_REDXOR_16(IData r) VL_PURE { -#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS) - return __builtin_parityl(r); -#else - r = (r ^ (r >> 1)); - r = (r ^ (r >> 2)); - r = (r ^ (r >> 4)); - r = (r ^ (r >> 8)); - return r; -#endif -} -static inline IData VL_REDXOR_32(IData r) VL_PURE { -#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS) - return __builtin_parityl(r); -#else - r = (r ^ (r >> 1)); - r = (r ^ (r >> 2)); - r = (r ^ (r >> 4)); - r = (r ^ (r >> 8)); - r = (r ^ (r >> 16)); - return r; -#endif -} -static inline IData VL_REDXOR_64(QData r) VL_PURE { -#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS) - return __builtin_parityll(r); -#else - r = (r ^ (r >> 1)); - r = (r ^ (r >> 2)); - r = (r ^ (r >> 4)); - r = (r ^ (r >> 8)); - r = (r ^ (r >> 16)); - r = (r ^ (r >> 32)); - return static_cast(r); -#endif -} -static inline IData VL_REDXOR_W(int words, WDataInP const lwp) VL_MT_SAFE { - EData r = lwp[0]; - for (int i = 1; i < words; ++i) r ^= lwp[i]; - return VL_REDXOR_32(r); -} - -// EMIT_RULE: VL_COUNTONES_II: oclean = false; lhs clean -static inline IData VL_COUNTONES_I(IData lhs) VL_PURE { - // This is faster than __builtin_popcountl - IData r = lhs - ((lhs >> 1) & 033333333333) - ((lhs >> 2) & 011111111111); - r = (r + (r >> 3)) & 030707070707; - r = (r + (r >> 6)); - r = (r + (r >> 12) + (r >> 24)) & 077; - return r; -} -static inline IData VL_COUNTONES_Q(QData lhs) VL_PURE { - return VL_COUNTONES_I(static_cast(lhs)) + VL_COUNTONES_I(static_cast(lhs >> 32)); -} -#define VL_COUNTONES_E VL_COUNTONES_I -static inline IData VL_COUNTONES_W(int words, WDataInP const lwp) VL_MT_SAFE { - EData r = 0; - for (int i = 0; i < words; ++i) r += VL_COUNTONES_E(lwp[i]); - return r; -} - -// EMIT_RULE: VL_COUNTBITS_II: oclean = false; lhs clean -static inline IData VL_COUNTBITS_I(int lbits, IData lhs, IData ctrl0, IData ctrl1, - IData ctrl2) VL_PURE { - int ctrlSum = (ctrl0 & 0x1) + (ctrl1 & 0x1) + (ctrl2 & 0x1); - if (ctrlSum == 3) { - return VL_COUNTONES_I(lhs); - } else if (ctrlSum == 0) { - IData mask = (lbits == 32) ? -1 : ((1 << lbits) - 1); - return VL_COUNTONES_I(~lhs & mask); - } else { - return (lbits == 32) ? 
32 : lbits; - } -} -static inline IData VL_COUNTBITS_Q(int lbits, QData lhs, IData ctrl0, IData ctrl1, - IData ctrl2) VL_PURE { - return VL_COUNTBITS_I(32, static_cast(lhs), ctrl0, ctrl1, ctrl2) - + VL_COUNTBITS_I(lbits - 32, static_cast(lhs >> 32), ctrl0, ctrl1, ctrl2); -} -#define VL_COUNTBITS_E VL_COUNTBITS_I -static inline IData VL_COUNTBITS_W(int lbits, int words, WDataInP const lwp, IData ctrl0, - IData ctrl1, IData ctrl2) VL_MT_SAFE { - EData r = 0; - IData wordLbits = 32; - for (int i = 0; i < words; ++i) { - if (i == words - 1) wordLbits = lbits % 32; - r += VL_COUNTBITS_E(wordLbits, lwp[i], ctrl0, ctrl1, ctrl2); - } - return r; -} - -static inline IData VL_ONEHOT_I(IData lhs) VL_PURE { - return (((lhs & (lhs - 1)) == 0) & (lhs != 0)); -} -static inline IData VL_ONEHOT_Q(QData lhs) VL_PURE { - return (((lhs & (lhs - 1)) == 0) & (lhs != 0)); -} -static inline IData VL_ONEHOT_W(int words, WDataInP const lwp) VL_MT_SAFE { - EData one = 0; - for (int i = 0; (i < words); ++i) { - if (lwp[i]) { - if (one) return 0; - one = 1; - if (lwp[i] & (lwp[i] - 1)) return 0; - } - } - return one; -} - -static inline IData VL_ONEHOT0_I(IData lhs) VL_PURE { return ((lhs & (lhs - 1)) == 0); } -static inline IData VL_ONEHOT0_Q(QData lhs) VL_PURE { return ((lhs & (lhs - 1)) == 0); } -static inline IData VL_ONEHOT0_W(int words, WDataInP const lwp) VL_MT_SAFE { - bool one = false; - for (int i = 0; (i < words); ++i) { - if (lwp[i]) { - if (one) return 0; - one = true; - if (lwp[i] & (lwp[i] - 1)) return 0; - } - } - return 1; -} - -static inline IData VL_CLOG2_I(IData lhs) VL_PURE { - // There are faster algorithms, or fls GCC4 builtins, but rarely used - if (VL_UNLIKELY(!lhs)) return 0; - --lhs; - int shifts = 0; - for (; lhs != 0; ++shifts) lhs = lhs >> 1; - return shifts; -} -static inline IData VL_CLOG2_Q(QData lhs) VL_PURE { - if (VL_UNLIKELY(!lhs)) return 0; - --lhs; - int shifts = 0; - for (; lhs != 0; ++shifts) lhs = lhs >> 1ULL; - return shifts; -} -static inline IData VL_CLOG2_W(int words, WDataInP const lwp) VL_MT_SAFE { - EData adjust = (VL_COUNTONES_W(words, lwp) == 1) ? 0 : 1; - for (int i = words - 1; i >= 0; --i) { - if (VL_UNLIKELY(lwp[i])) { // Shorter worst case if predict not taken - for (int bit = VL_EDATASIZE - 1; bit >= 0; --bit) { - if (VL_UNLIKELY(VL_BITISSET_E(lwp[i], bit))) { - return i * VL_EDATASIZE + bit + adjust; - } - } - // Can't get here - one bit must be set - } - } - return 0; -} - -static inline IData VL_MOSTSETBITP1_W(int words, WDataInP const lwp) VL_MT_SAFE { - // MSB set bit plus one; similar to FLS. 
0=value is zero - for (int i = words - 1; i >= 0; --i) { - if (VL_UNLIKELY(lwp[i])) { // Shorter worst case if predict not taken - for (int bit = VL_EDATASIZE - 1; bit >= 0; --bit) { - if (VL_UNLIKELY(VL_BITISSET_E(lwp[i], bit))) return i * VL_EDATASIZE + bit + 1; - } - // Can't get here - one bit must be set - } - } - return 0; -} - -//=================================================================== -// SIMPLE LOGICAL OPERATORS - -// EMIT_RULE: VL_AND: oclean=lclean||rclean; obits=lbits; lbits==rbits; -static inline WDataOutP VL_AND_W(int words, WDataOutP owp, WDataInP const lwp, - WDataInP const rwp) VL_MT_SAFE { - for (int i = 0; (i < words); ++i) owp[i] = (lwp[i] & rwp[i]); - return owp; -} -// EMIT_RULE: VL_OR: oclean=lclean&&rclean; obits=lbits; lbits==rbits; -static inline WDataOutP VL_OR_W(int words, WDataOutP owp, WDataInP const lwp, - WDataInP const rwp) VL_MT_SAFE { - for (int i = 0; (i < words); ++i) owp[i] = (lwp[i] | rwp[i]); - return owp; -} -// EMIT_RULE: VL_CHANGEXOR: oclean=1; obits=32; lbits==rbits; -static inline IData VL_CHANGEXOR_W(int words, WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { - IData od = 0; - for (int i = 0; (i < words); ++i) od |= (lwp[i] ^ rwp[i]); - return od; -} -// EMIT_RULE: VL_XOR: oclean=lclean&&rclean; obits=lbits; lbits==rbits; -static inline WDataOutP VL_XOR_W(int words, WDataOutP owp, WDataInP const lwp, - WDataInP const rwp) VL_MT_SAFE { - for (int i = 0; (i < words); ++i) owp[i] = (lwp[i] ^ rwp[i]); - return owp; -} -// EMIT_RULE: VL_NOT: oclean=dirty; obits=lbits; -static inline WDataOutP VL_NOT_W(int words, WDataOutP owp, WDataInP const lwp) VL_MT_SAFE { - for (int i = 0; i < words; ++i) owp[i] = ~(lwp[i]); - return owp; -} - -//========================================================================= -// Logical comparisons - -// EMIT_RULE: VL_EQ: oclean=clean; lclean==clean; rclean==clean; obits=1; lbits==rbits; -// EMIT_RULE: VL_NEQ: oclean=clean; lclean==clean; rclean==clean; obits=1; lbits==rbits; -// EMIT_RULE: VL_LT: oclean=clean; lclean==clean; rclean==clean; obits=1; lbits==rbits; -// EMIT_RULE: VL_GT: oclean=clean; lclean==clean; rclean==clean; obits=1; lbits==rbits; -// EMIT_RULE: VL_GTE: oclean=clean; lclean==clean; rclean==clean; obits=1; lbits==rbits; -// EMIT_RULE: VL_LTE: oclean=clean; lclean==clean; rclean==clean; obits=1; lbits==rbits; -#define VL_NEQ_W(words, lwp, rwp) (!VL_EQ_W(words, lwp, rwp)) -#define VL_LT_W(words, lwp, rwp) (_vl_cmp_w(words, lwp, rwp) < 0) -#define VL_LTE_W(words, lwp, rwp) (_vl_cmp_w(words, lwp, rwp) <= 0) -#define VL_GT_W(words, lwp, rwp) (_vl_cmp_w(words, lwp, rwp) > 0) -#define VL_GTE_W(words, lwp, rwp) (_vl_cmp_w(words, lwp, rwp) >= 0) - -// Output clean, AND MUST BE CLEAN -static inline IData VL_EQ_W(int words, WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { - EData nequal = 0; - for (int i = 0; (i < words); ++i) nequal |= (lwp[i] ^ rwp[i]); - return (nequal == 0); -} - -// Internal usage -static inline int _vl_cmp_w(int words, WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { - for (int i = words - 1; i >= 0; --i) { - if (lwp[i] > rwp[i]) return 1; - if (lwp[i] < rwp[i]) return -1; - } - return 0; // == -} - -#define VL_LTS_IWW(obits, lbits, rbbits, lwp, rwp) (_vl_cmps_w(lbits, lwp, rwp) < 0) -#define VL_LTES_IWW(obits, lbits, rbits, lwp, rwp) (_vl_cmps_w(lbits, lwp, rwp) <= 0) -#define VL_GTS_IWW(obits, lbits, rbits, lwp, rwp) (_vl_cmps_w(lbits, lwp, rwp) > 0) -#define VL_GTES_IWW(obits, lbits, rbits, lwp, rwp) (_vl_cmps_w(lbits, lwp, rwp) >= 0) - -static inline IData 
VL_GTS_III(int, int lbits, int, IData lhs, IData rhs) VL_PURE { - // For lbits==32, this becomes just a single instruction, otherwise ~5. - // GCC 3.3.4 sign extension bugs on AMD64 architecture force us to use quad logic - const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); // Q for gcc - const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); // Q for gcc - return lhs_signed > rhs_signed; -} -static inline IData VL_GTS_IQQ(int, int lbits, int, QData lhs, QData rhs) VL_PURE { - const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); - const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); - return lhs_signed > rhs_signed; -} - -static inline IData VL_GTES_III(int, int lbits, int, IData lhs, IData rhs) VL_PURE { - const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); // Q for gcc - const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); // Q for gcc - return lhs_signed >= rhs_signed; -} -static inline IData VL_GTES_IQQ(int, int lbits, int, QData lhs, QData rhs) VL_PURE { - const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); - const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); - return lhs_signed >= rhs_signed; -} - -static inline IData VL_LTS_III(int, int lbits, int, IData lhs, IData rhs) VL_PURE { - const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); // Q for gcc - const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); // Q for gcc - return lhs_signed < rhs_signed; -} -static inline IData VL_LTS_IQQ(int, int lbits, int, QData lhs, QData rhs) VL_PURE { - const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); - const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); - return lhs_signed < rhs_signed; -} - -static inline IData VL_LTES_III(int, int lbits, int, IData lhs, IData rhs) VL_PURE { - const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); // Q for gcc - const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); // Q for gcc - return lhs_signed <= rhs_signed; -} -static inline IData VL_LTES_IQQ(int, int lbits, int, QData lhs, QData rhs) VL_PURE { - const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); - const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); - return lhs_signed <= rhs_signed; -} - -static inline int _vl_cmps_w(int lbits, WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { - const int words = VL_WORDS_I(lbits); - int i = words - 1; - // We need to flip sense if negative comparison - const EData lsign = VL_SIGN_E(lbits, lwp[i]); - const EData rsign = VL_SIGN_E(lbits, rwp[i]); - if (!lsign && rsign) return 1; // + > - - if (lsign && !rsign) return -1; // - < + - for (; i >= 0; --i) { - if (lwp[i] > rwp[i]) return 1; - if (lwp[i] < rwp[i]) return -1; - } - return 0; // == -} - -//========================================================================= -// Math - -// Output NOT clean -static inline WDataOutP VL_NEGATE_W(int words, WDataOutP owp, WDataInP const lwp) VL_MT_SAFE { - EData carry = 1; - for (int i = 0; i < words; ++i) { - owp[i] = ~lwp[i] + carry; - carry = (owp[i] < ~lwp[i]); - } - return owp; -} -static inline void VL_NEGATE_INPLACE_W(int words, WDataOutP owp_lwp) VL_MT_SAFE { - EData carry = 1; - for (int i = 0; i < words; ++i) { - EData word = ~owp_lwp[i] + carry; - carry = (word < ~owp_lwp[i]); - owp_lwp[i] = word; - } -} - -// EMIT_RULE: VL_MUL: oclean=dirty; lclean==clean; rclean==clean; -// EMIT_RULE: VL_DIV: oclean=dirty; lclean==clean; rclean==clean; -// EMIT_RULE: VL_MODDIV: oclean=dirty; lclean==clean; rclean==clean; -#define VL_DIV_III(lbits, lhs, 
rhs) (((rhs) == 0) ? 0 : (lhs) / (rhs)) -#define VL_DIV_QQQ(lbits, lhs, rhs) (((rhs) == 0) ? 0 : (lhs) / (rhs)) -#define VL_DIV_WWW(lbits, owp, lwp, rwp) (_vl_moddiv_w(lbits, owp, lwp, rwp, 0)) -#define VL_MODDIV_III(lbits, lhs, rhs) (((rhs) == 0) ? 0 : (lhs) % (rhs)) -#define VL_MODDIV_QQQ(lbits, lhs, rhs) (((rhs) == 0) ? 0 : (lhs) % (rhs)) -#define VL_MODDIV_WWW(lbits, owp, lwp, rwp) (_vl_moddiv_w(lbits, owp, lwp, rwp, 1)) - -static inline WDataOutP VL_ADD_W(int words, WDataOutP owp, WDataInP const lwp, - WDataInP const rwp) VL_MT_SAFE { - QData carry = 0; - for (int i = 0; i < words; ++i) { - carry = carry + static_cast(lwp[i]) + static_cast(rwp[i]); - owp[i] = (carry & 0xffffffffULL); - carry = (carry >> 32ULL) & 0xffffffffULL; - } - // Last output word is dirty - return owp; -} - -static inline WDataOutP VL_SUB_W(int words, WDataOutP owp, WDataInP const lwp, - WDataInP const rwp) VL_MT_SAFE { - QData carry = 0; - for (int i = 0; i < words; ++i) { - carry = (carry + static_cast(lwp[i]) - + static_cast(static_cast(~rwp[i]))); - if (i == 0) ++carry; // Negation of rwp - owp[i] = (carry & 0xffffffffULL); - carry = (carry >> 32ULL) & 0xffffffffULL; - } - // Last output word is dirty - return owp; -} - -static inline WDataOutP VL_MUL_W(int words, WDataOutP owp, WDataInP const lwp, - WDataInP const rwp) VL_MT_SAFE { - for (int i = 0; i < words; ++i) owp[i] = 0; - for (int lword = 0; lword < words; ++lword) { - for (int rword = 0; rword < words; ++rword) { - QData mul = static_cast(lwp[lword]) * static_cast(rwp[rword]); - for (int qword = lword + rword; qword < words; ++qword) { - mul += static_cast(owp[qword]); - owp[qword] = (mul & 0xffffffffULL); - mul = (mul >> 32ULL) & 0xffffffffULL; - } - } - } - // Last output word is dirty - return owp; -} - -static inline IData VL_MULS_III(int, int lbits, int, IData lhs, IData rhs) VL_PURE { - const vlsint32_t lhs_signed = VL_EXTENDS_II(32, lbits, lhs); - const vlsint32_t rhs_signed = VL_EXTENDS_II(32, lbits, rhs); - return lhs_signed * rhs_signed; -} -static inline QData VL_MULS_QQQ(int, int lbits, int, QData lhs, QData rhs) VL_PURE { - const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); - const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); - return lhs_signed * rhs_signed; -} - -static inline WDataOutP VL_MULS_WWW(int, int lbits, int, WDataOutP owp, WDataInP const lwp, - WDataInP const rwp) VL_MT_SAFE { - const int words = VL_WORDS_I(lbits); - // cppcheck-suppress variableScope - WData lwstore[VL_MULS_MAX_WORDS]; // Fixed size, as MSVC++ doesn't allow [words] here - // cppcheck-suppress variableScope - WData rwstore[VL_MULS_MAX_WORDS]; - WDataInP lwusp = lwp; - WDataInP rwusp = rwp; - EData lneg = VL_SIGN_E(lbits, lwp[words - 1]); - if (lneg) { // Negate lhs - lwusp = lwstore; - VL_NEGATE_W(words, lwstore, lwp); - lwstore[words - 1] &= VL_MASK_E(lbits); // Clean it - } - EData rneg = VL_SIGN_E(lbits, rwp[words - 1]); - if (rneg) { // Negate rhs - rwusp = rwstore; - VL_NEGATE_W(words, rwstore, rwp); - rwstore[words - 1] &= VL_MASK_E(lbits); // Clean it - } - VL_MUL_W(words, owp, lwusp, rwusp); - owp[words - 1] &= VL_MASK_E( - lbits); // Clean. 
Note it's ok for the multiply to overflow into the sign bit - if ((lneg ^ rneg) & 1) { // Negate output (not using NEGATE, as owp==lwp) - QData carry = 0; - for (int i = 0; i < words; ++i) { - carry = carry + static_cast(static_cast(~owp[i])); - if (i == 0) ++carry; // Negation of temp2 - owp[i] = (carry & 0xffffffffULL); - carry = (carry >> 32ULL) & 0xffffffffULL; - } - // Not needed: owp[words-1] |= 1< 0) power = power * power; - if (rhs & (1ULL << i)) out *= power; - } - return out; -} -static inline QData VL_POW_QQQ(int, int, int rbits, QData lhs, QData rhs) VL_PURE { - if (VL_UNLIKELY(rhs == 0)) return 1; - if (VL_UNLIKELY(lhs == 0)) return 0; - QData power = lhs; - QData out = 1ULL; - for (int i = 0; i < rbits; ++i) { - if (i > 0) power = power * power; - if (rhs & (1ULL << i)) out *= power; - } - return out; -} -WDataOutP VL_POW_WWW(int obits, int, int rbits, WDataOutP owp, WDataInP const lwp, - WDataInP const rwp); -WDataOutP VL_POW_WWQ(int obits, int, int rbits, WDataOutP owp, WDataInP const lwp, QData rhs); -QData VL_POW_QQW(int obits, int, int rbits, QData lhs, WDataInP const rwp); - -#define VL_POWSS_IIQ(obits, lbits, rbits, lhs, rhs, lsign, rsign) \ - VL_POWSS_QQQ(obits, lbits, rbits, lhs, rhs, lsign, rsign) -#define VL_POWSS_IIQ(obits, lbits, rbits, lhs, rhs, lsign, rsign) \ - VL_POWSS_QQQ(obits, lbits, rbits, lhs, rhs, lsign, rsign) -#define VL_POWSS_IIW(obits, lbits, rbits, lhs, rwp, lsign, rsign) \ - VL_POWSS_QQW(obits, lbits, rbits, lhs, rwp, lsign, rsign) -#define VL_POWSS_QQI(obits, lbits, rbits, lhs, rhs, lsign, rsign) \ - VL_POWSS_QQQ(obits, lbits, rbits, lhs, rhs, lsign, rsign) -#define VL_POWSS_WWI(obits, lbits, rbits, owp, lwp, rhs, lsign, rsign) \ - VL_POWSS_WWQ(obits, lbits, rbits, owp, lwp, rhs, lsign, rsign) - -static inline IData VL_POWSS_III(int obits, int, int rbits, IData lhs, IData rhs, bool lsign, - bool rsign) VL_MT_SAFE { - if (VL_UNLIKELY(rhs == 0)) return 1; - if (rsign && VL_SIGN_I(rbits, rhs)) { - if (lhs == 0) { - return 0; // "X" - } else if (lhs == 1) { - return 1; - } else if (lsign && lhs == VL_MASK_I(obits)) { // -1 - if (rhs & 1) { - return VL_MASK_I(obits); // -1^odd=-1 - } else { - return 1; // -1^even=1 - } - } - return 0; - } - return VL_POW_III(obits, rbits, rbits, lhs, rhs); -} -static inline QData VL_POWSS_QQQ(int obits, int, int rbits, QData lhs, QData rhs, bool lsign, - bool rsign) VL_MT_SAFE { - if (VL_UNLIKELY(rhs == 0)) return 1; - if (rsign && VL_SIGN_Q(rbits, rhs)) { - if (lhs == 0) { - return 0; // "X" - } else if (lhs == 1) { - return 1; - } else if (lsign && lhs == VL_MASK_Q(obits)) { // -1 - if (rhs & 1) { - return VL_MASK_Q(obits); // -1^odd=-1 - } else { - return 1; // -1^even=1 - } - } - return 0; - } - return VL_POW_QQQ(obits, rbits, rbits, lhs, rhs); -} -WDataOutP VL_POWSS_WWW(int obits, int, int rbits, WDataOutP owp, WDataInP const lwp, - WDataInP const rwp, bool lsign, bool rsign); -WDataOutP VL_POWSS_WWQ(int obits, int, int rbits, WDataOutP owp, WDataInP const lwp, QData rhs, - bool lsign, bool rsign); -QData VL_POWSS_QQW(int obits, int, int rbits, QData lhs, WDataInP const rwp, bool lsign, - bool rsign); - -//=================================================================== -// Concat/replication - -// INTERNAL: Stuff LHS bit 0++ into OUTPUT at specified offset -// ld may be "dirty", output is clean -static inline void _vl_insert_II(int, CData& lhsr, IData ld, int hbit, int lbit, - int rbits) VL_PURE { - const IData cleanmask = VL_MASK_I(rbits); - const IData insmask = (VL_MASK_I(hbit - lbit + 1)) << lbit; - lhsr 
= (lhsr & ~insmask) | ((ld << lbit) & (insmask & cleanmask)); -} -static inline void _vl_insert_II(int, SData& lhsr, IData ld, int hbit, int lbit, - int rbits) VL_PURE { - const IData cleanmask = VL_MASK_I(rbits); - const IData insmask = (VL_MASK_I(hbit - lbit + 1)) << lbit; - lhsr = (lhsr & ~insmask) | ((ld << lbit) & (insmask & cleanmask)); -} -static inline void _vl_insert_II(int, IData& lhsr, IData ld, int hbit, int lbit, - int rbits) VL_PURE { - const IData cleanmask = VL_MASK_I(rbits); - const IData insmask = (VL_MASK_I(hbit - lbit + 1)) << lbit; - lhsr = (lhsr & ~insmask) | ((ld << lbit) & (insmask & cleanmask)); -} -static inline void _vl_insert_QQ(int, QData& lhsr, QData ld, int hbit, int lbit, - int rbits) VL_PURE { - const QData cleanmask = VL_MASK_Q(rbits); - const QData insmask = (VL_MASK_Q(hbit - lbit + 1)) << lbit; - lhsr = (lhsr & ~insmask) | ((ld << lbit) & (insmask & cleanmask)); -} -static inline void _vl_insert_WI(int, WDataOutP owp, IData ld, int hbit, int lbit, - int rbits = 0) VL_MT_SAFE { - const int hoffset = VL_BITBIT_E(hbit); - const int loffset = VL_BITBIT_E(lbit); - const int roffset = VL_BITBIT_E(rbits); - const int hword = VL_BITWORD_E(hbit); - const int lword = VL_BITWORD_E(lbit); - const int rword = VL_BITWORD_E(rbits); - const EData cleanmask = hword == rword ? VL_MASK_E(roffset) : VL_MASK_E(0); - - if (hoffset == VL_SIZEBITS_E && loffset == 0) { - // Fast and common case, word based insertion - owp[VL_BITWORD_E(lbit)] = ld & cleanmask; - } else { - const EData lde = static_cast(ld); - if (hword == lword) { // know < EData bits because above checks it - // Assignment is contained within one word of destination - const EData insmask = (VL_MASK_E(hoffset - loffset + 1)) << loffset; - owp[lword] = (owp[lword] & ~insmask) | ((lde << loffset) & (insmask & cleanmask)); - } else { - // Assignment crosses a word boundary in destination - const EData hinsmask = (VL_MASK_E(hoffset - 0 + 1)) << 0; - const EData linsmask = (VL_MASK_E((VL_EDATASIZE - 1) - loffset + 1)) << loffset; - const int nbitsonright = VL_EDATASIZE - loffset; // bits that end up in lword - owp[lword] = (owp[lword] & ~linsmask) | ((lde << loffset) & linsmask); - owp[hword] - = (owp[hword] & ~hinsmask) | ((lde >> nbitsonright) & (hinsmask & cleanmask)); - } - } -} - -// INTERNAL: Stuff large LHS bit 0++ into OUTPUT at specified offset -// lwp may be "dirty" -static inline void _vl_insert_WW(int, WDataOutP owp, WDataInP const lwp, int hbit, int lbit, - int rbits = 0) VL_MT_SAFE { - const int hoffset = VL_BITBIT_E(hbit); - const int loffset = VL_BITBIT_E(lbit); - const int roffset = VL_BITBIT_E(rbits); - const int lword = VL_BITWORD_E(lbit); - const int hword = VL_BITWORD_E(hbit); - const int rword = VL_BITWORD_E(rbits); - const int words = VL_WORDS_I(hbit - lbit + 1); - // Cleaning mask, only applied to top word of the assignment. Is a no-op - // if we don't assign to the top word of the destination. - const EData cleanmask = hword == rword ? 
VL_MASK_E(roffset) : VL_MASK_E(0); - - if (hoffset == VL_SIZEBITS_E && loffset == 0) { - // Fast and common case, word based insertion - for (int i = 0; i < (words - 1); ++i) owp[lword + i] = lwp[i]; - owp[hword] = lwp[words - 1] & cleanmask; - } else if (loffset == 0) { - // Non-32bit, but nicely aligned, so stuff all but the last word - for (int i = 0; i < (words - 1); ++i) owp[lword + i] = lwp[i]; - // Know it's not a full word as above fast case handled it - const EData hinsmask = (VL_MASK_E(hoffset - 0 + 1)); - owp[hword] = (owp[hword] & ~hinsmask) | (lwp[words - 1] & (hinsmask & cleanmask)); - } else { - const EData hinsmask = (VL_MASK_E(hoffset - 0 + 1)) << 0; - const EData linsmask = (VL_MASK_E((VL_EDATASIZE - 1) - loffset + 1)) << loffset; - const int nbitsonright - = VL_EDATASIZE - loffset; // bits that end up in lword (know loffset!=0) - // Middle words - for (int i = 0; i < words; ++i) { - { // Lower word - const int oword = lword + i; - const EData d = lwp[i] << loffset; - const EData od = (owp[oword] & ~linsmask) | (d & linsmask); - if (oword == hword) { - owp[oword] = (owp[oword] & ~hinsmask) | (od & (hinsmask & cleanmask)); - } else { - owp[oword] = od; - } - } - { // Upper word - const int oword = lword + i + 1; - if (oword <= hword) { - const EData d = lwp[i] >> nbitsonright; - const EData od = (d & ~linsmask) | (owp[oword] & linsmask); - if (oword == hword) { - owp[oword] = (owp[oword] & ~hinsmask) | (od & (hinsmask & cleanmask)); - } else { - owp[oword] = od; - } - } - } - } - } -} - -static inline void _vl_insert_WQ(int obits, WDataOutP owp, QData ld, int hbit, int lbit, - int rbits = 0) VL_MT_SAFE { - WData lwp[VL_WQ_WORDS_E]; - VL_SET_WQ(lwp, ld); - _vl_insert_WW(obits, owp, lwp, hbit, lbit, rbits); -} - -// EMIT_RULE: VL_REPLICATE: oclean=clean>width32, dirty<=width32; lclean=clean; rclean==clean; -// RHS MUST BE CLEAN CONSTANT. -#define VL_REPLICATE_IOI(obits, lbits, rbits, ld, rep) (-(ld)) // Iff lbits==1 -#define VL_REPLICATE_QOI(obits, lbits, rbits, ld, rep) (-(static_cast(ld))) // Iff lbits==1 - -static inline IData VL_REPLICATE_III(int, int lbits, int, IData ld, IData rep) VL_PURE { - IData returndata = ld; - for (unsigned i = 1; i < rep; ++i) { - returndata = returndata << lbits; - returndata |= ld; - } - return returndata; -} -static inline QData VL_REPLICATE_QII(int, int lbits, int, IData ld, IData rep) VL_PURE { - QData returndata = ld; - for (unsigned i = 1; i < rep; ++i) { - returndata = returndata << lbits; - returndata |= static_cast(ld); - } - return returndata; -} -static inline WDataOutP VL_REPLICATE_WII(int obits, int lbits, int, WDataOutP owp, IData ld, - IData rep) VL_MT_SAFE { - owp[0] = ld; - for (unsigned i = 1; i < rep; ++i) { - _vl_insert_WI(obits, owp, ld, i * lbits + lbits - 1, i * lbits); - } - return owp; -} -static inline WDataOutP VL_REPLICATE_WQI(int obits, int lbits, int, WDataOutP owp, QData ld, - IData rep) VL_MT_SAFE { - VL_SET_WQ(owp, ld); - for (unsigned i = 1; i < rep; ++i) { - _vl_insert_WQ(obits, owp, ld, i * lbits + lbits - 1, i * lbits); - } - return owp; -} -static inline WDataOutP VL_REPLICATE_WWI(int obits, int lbits, int, WDataOutP owp, - WDataInP const lwp, IData rep) VL_MT_SAFE { - for (int i = 0; i < VL_WORDS_I(lbits); ++i) owp[i] = lwp[i]; - for (unsigned i = 1; i < rep; ++i) { - _vl_insert_WW(obits, owp, lwp, i * lbits + lbits - 1, i * lbits); - } - return owp; -} - -// Left stream operator. Output will always be clean. LHS and RHS must be clean. -// Special "fast" versions for slice sizes that are a power of 2. 
These use -// shifts and masks to execute faster than the slower for-loop approach where a -// subset of bits is copied in during each iteration. -static inline IData VL_STREAML_FAST_III(int, int lbits, int, IData ld, IData rd_log2) VL_PURE { - // Pre-shift bits in most-significant slice: - // - // If lbits is not a multiple of the slice size (i.e., lbits % rd != 0), - // then we end up with a "gap" in our reversed result. For example, if we - // have a 5-bit Verlilog signal (lbits=5) in an 8-bit C data type: - // - // ld = ---43210 - // - // (where numbers are the Verilog signal bit numbers and '-' is an unused bit). - // Executing the switch statement below with a slice size of two (rd=2, - // rd_log2=1) produces: - // - // ret = 1032-400 - // - // Pre-shifting the bits in the most-significant slice allows us to avoid - // this gap in the shuffled data: - // - // ld_adjusted = --4-3210 - // ret = 10324--- - IData ret = ld; - if (rd_log2) { - const vluint32_t lbitsFloor = lbits & ~VL_MASK_I(rd_log2); // max multiple of rd <= lbits - const vluint32_t lbitsRem = lbits - lbitsFloor; // number of bits in most-sig slice (MSS) - const IData msbMask = VL_MASK_I(lbitsRem) << lbitsFloor; // mask to sel only bits in MSS - ret = (ret & ~msbMask) | ((ret & msbMask) << ((VL_UL(1) << rd_log2) - lbitsRem)); - } - switch (rd_log2) { - case 0: ret = ((ret >> 1) & VL_UL(0x55555555)) | ((ret & VL_UL(0x55555555)) << 1); // FALLTHRU - case 1: ret = ((ret >> 2) & VL_UL(0x33333333)) | ((ret & VL_UL(0x33333333)) << 2); // FALLTHRU - case 2: ret = ((ret >> 4) & VL_UL(0x0f0f0f0f)) | ((ret & VL_UL(0x0f0f0f0f)) << 4); // FALLTHRU - case 3: ret = ((ret >> 8) & VL_UL(0x00ff00ff)) | ((ret & VL_UL(0x00ff00ff)) << 8); // FALLTHRU - case 4: ret = ((ret >> 16) | (ret << 16)); // FALLTHRU - default:; - } - return ret >> (VL_IDATASIZE - lbits); -} - -static inline QData VL_STREAML_FAST_QQI(int, int lbits, int, QData ld, IData rd_log2) VL_PURE { - // Pre-shift bits in most-significant slice (see comment in VL_STREAML_FAST_III) - QData ret = ld; - if (rd_log2) { - const vluint32_t lbitsFloor = lbits & ~VL_MASK_I(rd_log2); - const vluint32_t lbitsRem = lbits - lbitsFloor; - const QData msbMask = VL_MASK_Q(lbitsRem) << lbitsFloor; - ret = (ret & ~msbMask) | ((ret & msbMask) << ((1ULL << rd_log2) - lbitsRem)); - } - switch (rd_log2) { - case 0: - ret = (((ret >> 1) & 0x5555555555555555ULL) - | ((ret & 0x5555555555555555ULL) << 1)); // FALLTHRU - case 1: - ret = (((ret >> 2) & 0x3333333333333333ULL) - | ((ret & 0x3333333333333333ULL) << 2)); // FALLTHRU - case 2: - ret = (((ret >> 4) & 0x0f0f0f0f0f0f0f0fULL) - | ((ret & 0x0f0f0f0f0f0f0f0fULL) << 4)); // FALLTHRU - case 3: - ret = (((ret >> 8) & 0x00ff00ff00ff00ffULL) - | ((ret & 0x00ff00ff00ff00ffULL) << 8)); // FALLTHRU - case 4: - ret = (((ret >> 16) & 0x0000ffff0000ffffULL) - | ((ret & 0x0000ffff0000ffffULL) << 16)); // FALLTHRU - case 5: ret = ((ret >> 32) | (ret << 32)); // FALLTHRU - default:; - } - return ret >> (VL_QUADSIZE - lbits); -} - -// Regular "slow" streaming operators -static inline IData VL_STREAML_III(int, int lbits, int, IData ld, IData rd) VL_PURE { - IData ret = 0; - // Slice size should never exceed the lhs width - const IData mask = VL_MASK_I(rd); - for (int istart = 0; istart < lbits; istart += rd) { - int ostart = lbits - rd - istart; - ostart = ostart > 0 ? 
ostart : 0; - ret |= ((ld >> istart) & mask) << ostart; - } - return ret; -} - -static inline QData VL_STREAML_QQI(int, int lbits, int, QData ld, IData rd) VL_PURE { - QData ret = 0; - // Slice size should never exceed the lhs width - const QData mask = VL_MASK_Q(rd); - for (int istart = 0; istart < lbits; istart += rd) { - int ostart = lbits - rd - istart; - ostart = ostart > 0 ? ostart : 0; - ret |= ((ld >> istart) & mask) << ostart; - } - return ret; -} - -static inline WDataOutP VL_STREAML_WWI(int, int lbits, int, WDataOutP owp, WDataInP const lwp, - IData rd) VL_MT_SAFE { - VL_ZERO_W(lbits, owp); - // Slice size should never exceed the lhs width - const int ssize = (rd < static_cast(lbits)) ? rd : (static_cast(lbits)); - for (int istart = 0; istart < lbits; istart += rd) { - int ostart = lbits - rd - istart; - ostart = ostart > 0 ? ostart : 0; - for (int sbit = 0; sbit < ssize && sbit < lbits - istart; ++sbit) { - // Extract a single bit from lwp and shift it to the correct - // location for owp. - EData bit = (VL_BITRSHIFT_W(lwp, (istart + sbit)) & 1) << VL_BITBIT_E(ostart + sbit); - owp[VL_BITWORD_E(ostart + sbit)] |= bit; - } - } - return owp; -} - -// Because concats are common and wide, it's valuable to always have a clean output. -// Thus we specify inputs must be clean, so we don't need to clean the output. -// Note the bit shifts are always constants, so the adds in these constify out. -// Casts required, as args may be 8 bit entities, and need to shift to appropriate output size -#define VL_CONCAT_III(obits, lbits, rbits, ld, rd) \ - (static_cast(ld) << (rbits) | static_cast(rd)) -#define VL_CONCAT_QII(obits, lbits, rbits, ld, rd) \ - (static_cast(ld) << (rbits) | static_cast(rd)) -#define VL_CONCAT_QIQ(obits, lbits, rbits, ld, rd) \ - (static_cast(ld) << (rbits) | static_cast(rd)) -#define VL_CONCAT_QQI(obits, lbits, rbits, ld, rd) \ - (static_cast(ld) << (rbits) | static_cast(rd)) -#define VL_CONCAT_QQQ(obits, lbits, rbits, ld, rd) \ - (static_cast(ld) << (rbits) | static_cast(rd)) - -static inline WDataOutP VL_CONCAT_WII(int obits, int lbits, int rbits, WDataOutP owp, IData ld, - IData rd) VL_MT_SAFE { - owp[0] = rd; - for (int i = 1; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - _vl_insert_WI(obits, owp, ld, rbits + lbits - 1, rbits); - return owp; -} -static inline WDataOutP VL_CONCAT_WWI(int obits, int lbits, int rbits, WDataOutP owp, - WDataInP const lwp, IData rd) VL_MT_SAFE { - owp[0] = rd; - for (int i = 1; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - _vl_insert_WW(obits, owp, lwp, rbits + lbits - 1, rbits); - return owp; -} -static inline WDataOutP VL_CONCAT_WIW(int obits, int lbits, int rbits, WDataOutP owp, IData ld, - WDataInP const rwp) VL_MT_SAFE { - for (int i = 0; i < VL_WORDS_I(rbits); ++i) owp[i] = rwp[i]; - for (int i = VL_WORDS_I(rbits); i < VL_WORDS_I(obits); ++i) owp[i] = 0; - _vl_insert_WI(obits, owp, ld, rbits + lbits - 1, rbits); - return owp; -} -static inline WDataOutP VL_CONCAT_WIQ(int obits, int lbits, int rbits, WDataOutP owp, IData ld, - QData rd) VL_MT_SAFE { - VL_SET_WQ(owp, rd); - for (int i = VL_WQ_WORDS_E; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - _vl_insert_WI(obits, owp, ld, rbits + lbits - 1, rbits); - return owp; -} -static inline WDataOutP VL_CONCAT_WQI(int obits, int lbits, int rbits, WDataOutP owp, QData ld, - IData rd) VL_MT_SAFE { - owp[0] = rd; - for (int i = 1; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - _vl_insert_WQ(obits, owp, ld, rbits + lbits - 1, rbits); - return owp; -} -static inline WDataOutP VL_CONCAT_WQQ(int obits, int lbits, 
int rbits, WDataOutP owp, QData ld, - QData rd) VL_MT_SAFE { - VL_SET_WQ(owp, rd); - for (int i = VL_WQ_WORDS_E; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - _vl_insert_WQ(obits, owp, ld, rbits + lbits - 1, rbits); - return owp; -} -static inline WDataOutP VL_CONCAT_WWQ(int obits, int lbits, int rbits, WDataOutP owp, - WDataInP const lwp, QData rd) VL_MT_SAFE { - VL_SET_WQ(owp, rd); - for (int i = VL_WQ_WORDS_E; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - _vl_insert_WW(obits, owp, lwp, rbits + lbits - 1, rbits); - return owp; -} -static inline WDataOutP VL_CONCAT_WQW(int obits, int lbits, int rbits, WDataOutP owp, QData ld, - WDataInP const rwp) VL_MT_SAFE { - for (int i = 0; i < VL_WORDS_I(rbits); ++i) owp[i] = rwp[i]; - for (int i = VL_WORDS_I(rbits); i < VL_WORDS_I(obits); ++i) owp[i] = 0; - _vl_insert_WQ(obits, owp, ld, rbits + lbits - 1, rbits); - return owp; -} -static inline WDataOutP VL_CONCAT_WWW(int obits, int lbits, int rbits, WDataOutP owp, - WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { - for (int i = 0; i < VL_WORDS_I(rbits); ++i) owp[i] = rwp[i]; - for (int i = VL_WORDS_I(rbits); i < VL_WORDS_I(obits); ++i) owp[i] = 0; - _vl_insert_WW(obits, owp, lwp, rbits + lbits - 1, rbits); - return owp; -} - -//=================================================================== -// Shifts - -// Static shift, used by internal functions -// The output is the same as the input - it overlaps! -static inline void _vl_shiftl_inplace_w(int obits, WDataOutP iowp, - IData rd /*1 or 4*/) VL_MT_SAFE { - const int words = VL_WORDS_I(obits); - const EData linsmask = VL_MASK_E(rd); - for (int i = words - 1; i >= 1; --i) { - iowp[i] - = ((iowp[i] << rd) & ~linsmask) | ((iowp[i - 1] >> (VL_EDATASIZE - rd)) & linsmask); - } - iowp[0] = ((iowp[0] << rd) & ~linsmask); - iowp[VL_WORDS_I(obits) - 1] &= VL_MASK_E(obits); -} - -// EMIT_RULE: VL_SHIFTL: oclean=lclean; rclean==clean; -// Important: Unlike most other funcs, the shift might well be a computed -// expression. Thus consider this when optimizing. (And perhaps have 2 funcs?) 
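As the note above says, the shift amount can be a runtime expression, so the wide routines below split it into a whole-word offset plus a residual bit offset and treat overshift as a special case. The following is a minimal standalone sketch of that decomposition on plain 32-bit words (illustrative names only, not the Verilated types; the real VL_SHIFTL_WWI additionally goes through _vl_insert_WW and masks the result to obits):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Shift 'in' left by 'shift' bits into 'out'; both hold 32-bit words with the
// least significant word first, mirroring the word order used by WData.
static void wide_shiftl(std::vector<uint32_t>& out, const std::vector<uint32_t>& in,
                        unsigned shift) {
    const unsigned word_shift = shift / 32;  // whole words the data moves up
    const unsigned bit_shift = shift % 32;   // remaining bit offset
    out.assign(in.size(), 0);
    if (word_shift >= in.size()) return;  // overshift: everything shifted out
    for (std::size_t i = in.size(); i-- > word_shift;) {
        uint32_t v = in[i - word_shift] << bit_shift;
        if (bit_shift != 0 && i - word_shift > 0)
            v |= in[i - word_shift - 1] >> (32 - bit_shift);  // bits carried up from the word below
        out[i] = v;
    }
}

int main() {
    const std::vector<uint32_t> in{0x80000001u, 0x00000000u};  // 64-bit value 0x0000000080000001
    std::vector<uint32_t> out;
    wide_shiftl(out, in, 1);  // expect 0x0000000100000002
    assert(out[0] == 0x00000002u && out[1] == 0x00000001u);
    return 0;
}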
-static inline WDataOutP VL_SHIFTL_WWI(int obits, int, int, WDataOutP owp, WDataInP const lwp, - IData rd) VL_MT_SAFE { - const int word_shift = VL_BITWORD_E(rd); - const int bit_shift = VL_BITBIT_E(rd); - if (rd >= static_cast(obits)) { // rd may be huge with MSB set - for (int i = 0; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - } else if (bit_shift == 0) { // Aligned word shift (<<0,<<32,<<64 etc) - for (int i = 0; i < word_shift; ++i) owp[i] = 0; - for (int i = word_shift; i < VL_WORDS_I(obits); ++i) owp[i] = lwp[i - word_shift]; - } else { - for (int i = 0; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - _vl_insert_WW(obits, owp, lwp, obits - 1, rd); - } - return owp; -} -static inline WDataOutP VL_SHIFTL_WWW(int obits, int lbits, int rbits, WDataOutP owp, - WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { - for (int i = 1; i < VL_WORDS_I(rbits); ++i) { - if (VL_UNLIKELY(rwp[i])) { // Huge shift 1>>32 or more - return VL_ZERO_W(obits, owp); - } - } - return VL_SHIFTL_WWI(obits, lbits, 32, owp, lwp, rwp[0]); -} -static inline WDataOutP VL_SHIFTL_WWQ(int obits, int lbits, int rbits, WDataOutP owp, - WDataInP const lwp, QData rd) VL_MT_SAFE { - WData rwp[VL_WQ_WORDS_E]; - VL_SET_WQ(rwp, rd); - return VL_SHIFTL_WWW(obits, lbits, rbits, owp, lwp, rwp); -} -static inline IData VL_SHIFTL_IIW(int obits, int, int rbits, IData lhs, - WDataInP const rwp) VL_MT_SAFE { - for (int i = 1; i < VL_WORDS_I(rbits); ++i) { - if (VL_UNLIKELY(rwp[i])) { // Huge shift 1>>32 or more - return 0; - } - } - return VL_CLEAN_II(obits, obits, lhs << rwp[0]); -} -static inline IData VL_SHIFTL_IIQ(int obits, int, int, IData lhs, QData rhs) VL_MT_SAFE { - if (VL_UNLIKELY(rhs >= VL_IDATASIZE)) return 0; - return VL_CLEAN_II(obits, obits, lhs << rhs); -} -static inline QData VL_SHIFTL_QQW(int obits, int, int rbits, QData lhs, - WDataInP const rwp) VL_MT_SAFE { - for (int i = 1; i < VL_WORDS_I(rbits); ++i) { - if (VL_UNLIKELY(rwp[i])) { // Huge shift 1>>32 or more - return 0; - } - } - // Above checks rwp[1]==0 so not needed in below shift - return VL_CLEAN_QQ(obits, obits, lhs << (static_cast(rwp[0]))); -} -static inline QData VL_SHIFTL_QQQ(int obits, int, int, QData lhs, QData rhs) VL_MT_SAFE { - if (VL_UNLIKELY(rhs >= VL_QUADSIZE)) return 0; - return VL_CLEAN_QQ(obits, obits, lhs << rhs); -} - -// EMIT_RULE: VL_SHIFTR: oclean=lclean; rclean==clean; -// Important: Unlike most other funcs, the shift might well be a computed -// expression. Thus consider this when optimizing. (And perhaps have 2 funcs?) 
-static inline WDataOutP VL_SHIFTR_WWI(int obits, int, int, WDataOutP owp, WDataInP const lwp, - IData rd) VL_MT_SAFE { - const int word_shift = VL_BITWORD_E(rd); // Maybe 0 - const int bit_shift = VL_BITBIT_E(rd); - if (rd >= static_cast(obits)) { // rd may be huge with MSB set - for (int i = 0; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - } else if (bit_shift == 0) { // Aligned word shift (>>0,>>32,>>64 etc) - const int copy_words = (VL_WORDS_I(obits) - word_shift); - for (int i = 0; i < copy_words; ++i) owp[i] = lwp[i + word_shift]; - for (int i = copy_words; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - } else { - const int loffset = rd & VL_SIZEBITS_E; - const int nbitsonright = VL_EDATASIZE - loffset; // bits that end up in lword (know - // loffset!=0) Middle words - const int words = VL_WORDS_I(obits - rd); - for (int i = 0; i < words; ++i) { - owp[i] = lwp[i + word_shift] >> loffset; - const int upperword = i + word_shift + 1; - if (upperword < VL_WORDS_I(obits)) owp[i] |= lwp[upperword] << nbitsonright; - } - for (int i = words; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - } - return owp; -} -static inline WDataOutP VL_SHIFTR_WWW(int obits, int lbits, int rbits, WDataOutP owp, - WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { - for (int i = 1; i < VL_WORDS_I(rbits); ++i) { - if (VL_UNLIKELY(rwp[i])) { // Huge shift 1>>32 or more - return VL_ZERO_W(obits, owp); - } - } - return VL_SHIFTR_WWI(obits, lbits, 32, owp, lwp, rwp[0]); -} -static inline WDataOutP VL_SHIFTR_WWQ(int obits, int lbits, int rbits, WDataOutP owp, - WDataInP const lwp, QData rd) VL_MT_SAFE { - WData rwp[VL_WQ_WORDS_E]; - VL_SET_WQ(rwp, rd); - return VL_SHIFTR_WWW(obits, lbits, rbits, owp, lwp, rwp); -} - -static inline IData VL_SHIFTR_IIW(int obits, int, int rbits, IData lhs, - WDataInP const rwp) VL_MT_SAFE { - for (int i = 1; i < VL_WORDS_I(rbits); ++i) { - if (VL_UNLIKELY(rwp[i])) { // Huge shift 1>>32 or more - return 0; - } - } - return VL_CLEAN_II(obits, obits, lhs >> rwp[0]); -} -static inline QData VL_SHIFTR_QQW(int obits, int, int rbits, QData lhs, - WDataInP const rwp) VL_MT_SAFE { - for (int i = 1; i < VL_WORDS_I(rbits); ++i) { - if (VL_UNLIKELY(rwp[i])) { // Huge shift 1>>32 or more - return 0; - } - } - // Above checks rwp[1]==0 so not needed in below shift - return VL_CLEAN_QQ(obits, obits, lhs >> (static_cast(rwp[0]))); -} -static inline IData VL_SHIFTR_IIQ(int obits, int, int, IData lhs, QData rhs) VL_MT_SAFE { - if (VL_UNLIKELY(rhs >= VL_IDATASIZE)) return 0; - return VL_CLEAN_QQ(obits, obits, lhs >> rhs); -} -static inline QData VL_SHIFTR_QQQ(int obits, int, int, QData lhs, QData rhs) VL_MT_SAFE { - if (VL_UNLIKELY(rhs >= VL_QUADSIZE)) return 0; - return VL_CLEAN_QQ(obits, obits, lhs >> rhs); -} - -// EMIT_RULE: VL_SHIFTRS: oclean=false; lclean=clean, rclean==clean; -static inline IData VL_SHIFTRS_III(int obits, int lbits, int, IData lhs, IData rhs) VL_PURE { - // Note the C standard does not specify the >> operator as a arithmetic shift! - // IEEE says signed if output signed, but bit position from lbits; - // must use lbits for sign; lbits might != obits, - // an EXTEND(SHIFTRS(...)) can became a SHIFTRS(...) 
within same 32/64 bit word length - const IData sign = -(lhs >> (lbits - 1)); // ffff_ffff if negative - const IData signext = ~(VL_MASK_I(lbits) >> rhs); // One with bits where we've shifted "past" - return (lhs >> rhs) | (sign & VL_CLEAN_II(obits, obits, signext)); -} -static inline QData VL_SHIFTRS_QQI(int obits, int lbits, int, QData lhs, IData rhs) VL_PURE { - const QData sign = -(lhs >> (lbits - 1)); - const QData signext = ~(VL_MASK_Q(lbits) >> rhs); - return (lhs >> rhs) | (sign & VL_CLEAN_QQ(obits, obits, signext)); -} -static inline IData VL_SHIFTRS_IQI(int obits, int lbits, int rbits, QData lhs, IData rhs) VL_PURE { - return static_cast(VL_SHIFTRS_QQI(obits, lbits, rbits, lhs, rhs)); -} -static inline WDataOutP VL_SHIFTRS_WWI(int obits, int lbits, int, WDataOutP owp, - WDataInP const lwp, IData rd) VL_MT_SAFE { - const int word_shift = VL_BITWORD_E(rd); - const int bit_shift = VL_BITBIT_E(rd); - const int lmsw = VL_WORDS_I(obits) - 1; - const EData sign = VL_SIGNONES_E(lbits, lwp[lmsw]); - if (rd >= static_cast(obits)) { // Shifting past end, sign in all of lbits - for (int i = 0; i <= lmsw; ++i) owp[i] = sign; - owp[lmsw] &= VL_MASK_E(lbits); - } else if (bit_shift == 0) { // Aligned word shift (>>0,>>32,>>64 etc) - const int copy_words = (VL_WORDS_I(obits) - word_shift); - for (int i = 0; i < copy_words; ++i) owp[i] = lwp[i + word_shift]; - if (copy_words >= 0) owp[copy_words - 1] |= ~VL_MASK_E(obits) & sign; - for (int i = copy_words; i < VL_WORDS_I(obits); ++i) owp[i] = sign; - owp[lmsw] &= VL_MASK_E(lbits); - } else { - const int loffset = rd & VL_SIZEBITS_E; - int nbitsonright = VL_EDATASIZE - loffset; // bits that end up in lword (know loffset!=0) - // Middle words - const int words = VL_WORDS_I(obits - rd); - for (int i = 0; i < words; ++i) { - owp[i] = lwp[i + word_shift] >> loffset; - const int upperword = i + word_shift + 1; - if (upperword < VL_WORDS_I(obits)) owp[i] |= lwp[upperword] << nbitsonright; - } - if (words) owp[words - 1] |= sign & ~VL_MASK_E(obits - loffset); - for (int i = words; i < VL_WORDS_I(obits); ++i) owp[i] = sign; - owp[lmsw] &= VL_MASK_E(lbits); - } - return owp; -} -static inline WDataOutP VL_SHIFTRS_WWW(int obits, int lbits, int rbits, WDataOutP owp, - WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { - EData overshift = 0; // Huge shift 1>>32 or more - for (int i = 1; i < VL_WORDS_I(rbits); ++i) overshift |= rwp[i]; - if (VL_UNLIKELY(overshift || rwp[0] >= obits)) { - const int lmsw = VL_WORDS_I(obits) - 1; - const EData sign = VL_SIGNONES_E(lbits, lwp[lmsw]); - for (int j = 0; j <= lmsw; ++j) owp[j] = sign; - owp[lmsw] &= VL_MASK_E(lbits); - return owp; - } - return VL_SHIFTRS_WWI(obits, lbits, 32, owp, lwp, rwp[0]); -} -static inline WDataOutP VL_SHIFTRS_WWQ(int obits, int lbits, int rbits, WDataOutP owp, - WDataInP const lwp, QData rd) VL_MT_SAFE { - WData rwp[VL_WQ_WORDS_E]; - VL_SET_WQ(rwp, rd); - return VL_SHIFTRS_WWW(obits, lbits, rbits, owp, lwp, rwp); -} -static inline IData VL_SHIFTRS_IIW(int obits, int lbits, int rbits, IData lhs, - WDataInP const rwp) VL_MT_SAFE { - EData overshift = 0; // Huge shift 1>>32 or more - for (int i = 1; i < VL_WORDS_I(rbits); ++i) overshift |= rwp[i]; - if (VL_UNLIKELY(overshift || rwp[0] >= obits)) { - const IData sign = -(lhs >> (lbits - 1)); // ffff_ffff if negative - return VL_CLEAN_II(obits, obits, sign); - } - return VL_SHIFTRS_III(obits, lbits, 32, lhs, rwp[0]); -} -static inline QData VL_SHIFTRS_QQW(int obits, int lbits, int rbits, QData lhs, - WDataInP const rwp) VL_MT_SAFE { - EData 
overshift = 0; // Huge shift 1>>32 or more - for (int i = 1; i < VL_WORDS_I(rbits); ++i) overshift |= rwp[i]; - if (VL_UNLIKELY(overshift || rwp[0] >= obits)) { - const QData sign = -(lhs >> (lbits - 1)); // ffff_ffff if negative - return VL_CLEAN_QQ(obits, obits, sign); - } - return VL_SHIFTRS_QQI(obits, lbits, 32, lhs, rwp[0]); -} -static inline IData VL_SHIFTRS_IIQ(int obits, int lbits, int rbits, IData lhs, - QData rhs) VL_MT_SAFE { - WData rwp[VL_WQ_WORDS_E]; - VL_SET_WQ(rwp, rhs); - return VL_SHIFTRS_IIW(obits, lbits, rbits, lhs, rwp); -} -static inline QData VL_SHIFTRS_QQQ(int obits, int lbits, int rbits, QData lhs, QData rhs) VL_PURE { - WData rwp[VL_WQ_WORDS_E]; - VL_SET_WQ(rwp, rhs); - return VL_SHIFTRS_QQW(obits, lbits, rbits, lhs, rwp); -} - -//=================================================================== -// Bit selection - -// EMIT_RULE: VL_BITSEL: oclean=dirty; rclean==clean; -#define VL_BITSEL_IIII(obits, lbits, rbits, zbits, lhs, rhs) ((lhs) >> (rhs)) -#define VL_BITSEL_QIII(obits, lbits, rbits, zbits, lhs, rhs) ((lhs) >> (rhs)) -#define VL_BITSEL_QQII(obits, lbits, rbits, zbits, lhs, rhs) ((lhs) >> (rhs)) -#define VL_BITSEL_IQII(obits, lbits, rbits, zbits, lhs, rhs) (static_cast((lhs) >> (rhs))) - -static inline IData VL_BITSEL_IWII(int, int lbits, int, int, WDataInP const lwp, - IData rd) VL_MT_SAFE { - int word = VL_BITWORD_E(rd); - if (VL_UNLIKELY(rd > static_cast(lbits))) { - return ~0; // Spec says you can go outside the range of a array. Don't coredump if so. - // We return all 1's as that's more likely to find bugs (?) than 0's. - } else { - return (lwp[word] >> VL_BITBIT_E(rd)); - } -} - -// EMIT_RULE: VL_RANGE: oclean=lclean; out=dirty -// & MUST BE CLEAN (currently constant) -#define VL_SEL_IIII(obits, lbits, rbits, tbits, lhs, lsb, width) ((lhs) >> (lsb)) -#define VL_SEL_QQII(obits, lbits, rbits, tbits, lhs, lsb, width) ((lhs) >> (lsb)) -#define VL_SEL_IQII(obits, lbits, rbits, tbits, lhs, lsb, width) \ - (static_cast((lhs) >> (lsb))) - -static inline IData VL_SEL_IWII(int, int lbits, int, int, WDataInP const lwp, IData lsb, - IData width) VL_MT_SAFE { - int msb = lsb + width - 1; - if (VL_UNLIKELY(msb >= lbits)) { - return ~0; // Spec says you can go outside the range of a array. Don't coredump if so. - } else if (VL_BITWORD_E(msb) == VL_BITWORD_E(static_cast(lsb))) { - return VL_BITRSHIFT_W(lwp, lsb); - } else { - // 32 bit extraction may span two words - int nbitsfromlow = VL_EDATASIZE - VL_BITBIT_E(lsb); // bits that come from low word - return ((lwp[VL_BITWORD_E(msb)] << nbitsfromlow) | VL_BITRSHIFT_W(lwp, lsb)); - } -} - -static inline QData VL_SEL_QWII(int, int lbits, int, int, WDataInP const lwp, IData lsb, - IData width) VL_MT_SAFE { - const int msb = lsb + width - 1; - if (VL_UNLIKELY(msb > lbits)) { - return ~0; // Spec says you can go outside the range of a array. Don't coredump if so. 
- } else if (VL_BITWORD_E(msb) == VL_BITWORD_E(static_cast(lsb))) { - return VL_BITRSHIFT_W(lwp, lsb); - } else if (VL_BITWORD_E(msb) == 1 + VL_BITWORD_E(static_cast(lsb))) { - const int nbitsfromlow = VL_EDATASIZE - VL_BITBIT_E(lsb); - const QData hi = (lwp[VL_BITWORD_E(msb)]); - const QData lo = VL_BITRSHIFT_W(lwp, lsb); - return (hi << nbitsfromlow) | lo; - } else { - // 64 bit extraction may span three words - int nbitsfromlow = VL_EDATASIZE - VL_BITBIT_E(lsb); - const QData hi = (lwp[VL_BITWORD_E(msb)]); - const QData mid = (lwp[VL_BITWORD_E(lsb) + 1]); - const QData lo = VL_BITRSHIFT_W(lwp, lsb); - return (hi << (nbitsfromlow + VL_EDATASIZE)) | (mid << nbitsfromlow) | lo; - } -} - -static inline WDataOutP VL_SEL_WWII(int obits, int lbits, int, int, WDataOutP owp, - WDataInP const lwp, IData lsb, IData width) VL_MT_SAFE { - const int msb = lsb + width - 1; - const int word_shift = VL_BITWORD_E(lsb); - if (VL_UNLIKELY(msb > lbits)) { // Outside bounds, - for (int i = 0; i < VL_WORDS_I(obits) - 1; ++i) owp[i] = ~0; - owp[VL_WORDS_I(obits) - 1] = VL_MASK_E(obits); - } else if (VL_BITBIT_E(lsb) == 0) { - // Just a word extract - for (int i = 0; i < VL_WORDS_I(obits); ++i) owp[i] = lwp[i + word_shift]; - } else { - // Not a _vl_insert because the bits come from any bit number and goto bit 0 - const int loffset = lsb & VL_SIZEBITS_E; - const int nbitsfromlow = VL_EDATASIZE - loffset; // bits that end up in lword (know - // loffset!=0) Middle words - const int words = VL_WORDS_I(msb - lsb + 1); - for (int i = 0; i < words; ++i) { - owp[i] = lwp[i + word_shift] >> loffset; - const int upperword = i + word_shift + 1; - if (upperword <= static_cast(VL_BITWORD_E(msb))) { - owp[i] |= lwp[upperword] << nbitsfromlow; - } - } - for (int i = words; i < VL_WORDS_I(obits); ++i) owp[i] = 0; - } - return owp; -} - -//====================================================================== -// Math needing insert/select - -// Return QData from double (numeric) -// EMIT_RULE: VL_RTOIROUND_Q_D: oclean=dirty; lclean==clean/real -static inline QData VL_RTOIROUND_Q_D(int, double lhs) VL_PURE { - // IEEE format: [63]=sign [62:52]=exp+1023 [51:0]=mantissa - // This does not need to support subnormals as they are sub-integral - lhs = VL_ROUND(lhs); - if (lhs == 0.0) return 0; - const QData q = VL_CVT_Q_D(lhs); - const int lsb = static_cast((q >> 52ULL) & VL_MASK_Q(11)) - 1023 - 52; - const vluint64_t mantissa = (q & VL_MASK_Q(52)) | (1ULL << 52); - vluint64_t out = 0; - if (lsb < 0) { - out = mantissa >> -lsb; - } else if (lsb < 64) { - out = mantissa << lsb; - } - if (lhs < 0) out = -out; - return out; -} -static inline IData VL_RTOIROUND_I_D(int bits, double lhs) VL_PURE { - return static_cast(VL_RTOIROUND_Q_D(bits, lhs)); -} -static inline WDataOutP VL_RTOIROUND_W_D(int obits, WDataOutP owp, double lhs) VL_PURE { - // IEEE format: [63]=sign [62:52]=exp+1023 [51:0]=mantissa - // This does not need to support subnormals as they are sub-integral - lhs = VL_ROUND(lhs); - VL_ZERO_W(obits, owp); - if (lhs == 0.0) return owp; - const QData q = VL_CVT_Q_D(lhs); - const int lsb = static_cast((q >> 52ULL) & VL_MASK_Q(11)) - 1023 - 52; - const vluint64_t mantissa = (q & VL_MASK_Q(52)) | (1ULL << 52); - if (lsb < 0) { - VL_SET_WQ(owp, mantissa >> -lsb); - } else if (lsb < obits) { - _vl_insert_WQ(obits, owp, mantissa, lsb + 52, lsb); - } - if (lhs < 0) VL_NEGATE_INPLACE_W(VL_WORDS_I(obits), owp); - return owp; -} - -//====================================================================== -// Range assignments - -// 
EMIT_RULE: VL_ASSIGNRANGE: rclean=dirty; -static inline void VL_ASSIGNSEL_IIII(int rbits, int obits, int lsb, CData& lhsr, - IData rhs) VL_PURE { - _vl_insert_II(obits, lhsr, rhs, lsb + obits - 1, lsb, rbits); -} -static inline void VL_ASSIGNSEL_IIII(int rbits, int obits, int lsb, SData& lhsr, - IData rhs) VL_PURE { - _vl_insert_II(obits, lhsr, rhs, lsb + obits - 1, lsb, rbits); -} -static inline void VL_ASSIGNSEL_IIII(int rbits, int obits, int lsb, IData& lhsr, - IData rhs) VL_PURE { - _vl_insert_II(obits, lhsr, rhs, lsb + obits - 1, lsb, rbits); -} -static inline void VL_ASSIGNSEL_QIII(int rbits, int obits, int lsb, QData& lhsr, - IData rhs) VL_PURE { - _vl_insert_QQ(obits, lhsr, rhs, lsb + obits - 1, lsb, rbits); -} -static inline void VL_ASSIGNSEL_QQII(int rbits, int obits, int lsb, QData& lhsr, - QData rhs) VL_PURE { - _vl_insert_QQ(obits, lhsr, rhs, lsb + obits - 1, lsb, rbits); -} -static inline void VL_ASSIGNSEL_QIIQ(int rbits, int obits, int lsb, QData& lhsr, - QData rhs) VL_PURE { - _vl_insert_QQ(obits, lhsr, rhs, lsb + obits - 1, lsb, rbits); -} -// static inline void VL_ASSIGNSEL_IIIW(int obits, int lsb, IData& lhsr, WDataInP const rwp) -// VL_MT_SAFE { Illegal, as lhs width >= rhs width -static inline void VL_ASSIGNSEL_WIII(int rbits, int obits, int lsb, WDataOutP owp, - IData rhs) VL_MT_SAFE { - _vl_insert_WI(obits, owp, rhs, lsb + obits - 1, lsb, rbits); -} -static inline void VL_ASSIGNSEL_WIIQ(int rbits, int obits, int lsb, WDataOutP owp, - QData rhs) VL_MT_SAFE { - _vl_insert_WQ(obits, owp, rhs, lsb + obits - 1, lsb, rbits); -} -static inline void VL_ASSIGNSEL_WIIW(int rbits, int obits, int lsb, WDataOutP owp, - WDataInP const rwp) VL_MT_SAFE { - _vl_insert_WW(obits, owp, rwp, lsb + obits - 1, lsb, rbits); -} - -//====================================================================== -// Triops - -static inline WDataOutP VL_COND_WIWW(int obits, int, int, int, WDataOutP owp, int cond, - WDataInP const w1p, WDataInP const w2p) VL_MT_SAFE { - const int words = VL_WORDS_I(obits); - for (int i = 0; i < words; ++i) owp[i] = cond ? w1p[i] : w2p[i]; - return owp; -} - -//====================================================================== -// Constification - -// VL_CONST_W_#X(int obits, WDataOutP owp, IData data0, .... IData data(#-1)) -// Sets wide vector words to specified constant words. -// These macros are used when o might represent more words then are given as constants, -// hence all upper words must be zeroed. 
-// If changing the number of functions here, also change EMITCINLINES_NUM_CONSTW - -#define VL_C_END_(obits, wordsSet) \ - for (int i = (wordsSet); i < VL_WORDS_I(obits); ++i) o[i] = 0; \ - return o - -// clang-format off -static inline WDataOutP VL_CONST_W_1X(int obits, WDataOutP o, EData d0) VL_MT_SAFE { - o[0] = d0; - VL_C_END_(obits, 1); -} -static inline WDataOutP VL_CONST_W_2X(int obits, WDataOutP o, EData d1, EData d0) VL_MT_SAFE { - o[0] = d0; o[1] = d1; - VL_C_END_(obits, 2); -} -static inline WDataOutP VL_CONST_W_3X(int obits, WDataOutP o, EData d2, EData d1, - EData d0) VL_MT_SAFE { - o[0] = d0; o[1] = d1; o[2] = d2; - VL_C_END_(obits,3); -} -static inline WDataOutP VL_CONST_W_4X(int obits, WDataOutP o, - EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { - o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; - VL_C_END_(obits,4); -} -static inline WDataOutP VL_CONST_W_5X(int obits, WDataOutP o, - EData d4, - EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { - o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; - o[4] = d4; - VL_C_END_(obits,5); -} -static inline WDataOutP VL_CONST_W_6X(int obits, WDataOutP o, - EData d5, EData d4, - EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { - o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; - o[4] = d4; o[5] = d5; - VL_C_END_(obits,6); -} -static inline WDataOutP VL_CONST_W_7X(int obits, WDataOutP o, - EData d6, EData d5, EData d4, - EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { - o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; - o[4] = d4; o[5] = d5; o[6] = d6; - VL_C_END_(obits,7); -} -static inline WDataOutP VL_CONST_W_8X(int obits, WDataOutP o, - EData d7, EData d6, EData d5, EData d4, - EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { - o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; - o[4] = d4; o[5] = d5; o[6] = d6; o[7] = d7; - VL_C_END_(obits,8); -} -// -static inline WDataOutP VL_CONSTHI_W_1X(int obits, int lsb, WDataOutP obase, - EData d0) VL_MT_SAFE { - WDataOutP o = obase + VL_WORDS_I(lsb); - o[0] = d0; - VL_C_END_(obits, VL_WORDS_I(lsb) + 1); -} -static inline WDataOutP VL_CONSTHI_W_2X(int obits, int lsb, WDataOutP obase, - EData d1, EData d0) VL_MT_SAFE { - WDataOutP o = obase + VL_WORDS_I(lsb); - o[0] = d0; o[1] = d1; - VL_C_END_(obits, VL_WORDS_I(lsb) + 2); -} -static inline WDataOutP VL_CONSTHI_W_3X(int obits, int lsb, WDataOutP obase, - EData d2, EData d1, EData d0) VL_MT_SAFE { - WDataOutP o = obase + VL_WORDS_I(lsb); - o[0] = d0; o[1] = d1; o[2] = d2; - VL_C_END_(obits, VL_WORDS_I(lsb) + 3); -} -static inline WDataOutP VL_CONSTHI_W_4X(int obits, int lsb, WDataOutP obase, - EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { - WDataOutP o = obase + VL_WORDS_I(lsb); - o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; - VL_C_END_(obits, VL_WORDS_I(lsb) + 4); -} -static inline WDataOutP VL_CONSTHI_W_5X(int obits, int lsb, WDataOutP obase, - EData d4, - EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { - WDataOutP o = obase + VL_WORDS_I(lsb); - o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; - o[4] = d4; - VL_C_END_(obits, VL_WORDS_I(lsb) + 5); -} -static inline WDataOutP VL_CONSTHI_W_6X(int obits, int lsb, WDataOutP obase, - EData d5, EData d4, - EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { - WDataOutP o = obase + VL_WORDS_I(lsb); - o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; - o[4] = d4; o[5] = d5; - VL_C_END_(obits, VL_WORDS_I(lsb) + 6); -} -static inline WDataOutP VL_CONSTHI_W_7X(int obits, int lsb, WDataOutP obase, - EData d6, EData d5, EData d4, - EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { - WDataOutP o = obase + 
VL_WORDS_I(lsb); - o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; - o[4] = d4; o[5] = d5; o[6] = d6; - VL_C_END_(obits, VL_WORDS_I(lsb) + 7); -} -static inline WDataOutP VL_CONSTHI_W_8X(int obits, int lsb, WDataOutP obase, - EData d7, EData d6, EData d5, EData d4, - EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { - WDataOutP o = obase + VL_WORDS_I(lsb); - o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; - o[4] = d4; o[5] = d5; o[6] = d6; o[7] = d7; - VL_C_END_(obits, VL_WORDS_I(lsb) + 8); -} - -#undef VL_C_END_ - -// Partial constant, lower words of vector wider than 8*32, starting at bit number lsb -static inline void VL_CONSTLO_W_8X(int lsb, WDataOutP obase, - EData d7, EData d6, EData d5, EData d4, - EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { - WDataOutP o = obase + VL_WORDS_I(lsb); - o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; o[4] = d4; o[5] = d5; o[6] = d6; o[7] = d7; -} -// clang-format on +#include "verilated_funcs.h" //====================================================================== +#undef VERILATOR_VERILATED_H_INTERNAL_ #endif // Guard diff --git a/include/verilated_dpi.h b/include/verilated_dpi.h index d86e741fa..c1638f5d1 100644 --- a/include/verilated_dpi.h +++ b/include/verilated_dpi.h @@ -28,7 +28,6 @@ #include "verilatedos.h" #include "verilated.h" // Also presumably included by caller -#include "verilated_heavy.h" // Also presumably included by caller #include "verilated_sym_props.h" #include "svdpi.h" diff --git a/include/verilated_funcs.h b/include/verilated_funcs.h new file mode 100644 index 000000000..0c11c6551 --- /dev/null +++ b/include/verilated_funcs.h @@ -0,0 +1,2252 @@ +// -*- mode: C++; c-file-style: "cc-mode" -*- +//************************************************************************* +// +// Code available from: https://verilator.org +// +// Copyright 2003-2021 by Wilson Snyder. This program is free software; you can +// redistribute it and/or modify it under the terms of either the GNU +// Lesser General Public License Version 3 or the Perl Artistic License +// Version 2.0. +// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 +// +//************************************************************************* +/// +/// \file +/// \brief Verilated common functions +/// +/// verilated.h should be included instead of this file. +/// +/// Those macro/function/variable starting or ending in _ are internal, +/// however many of the other function/macros here are also internal. +/// +//************************************************************************* + +#ifndef VERILATOR_VERILATED_FUNCS_H_ +#define VERILATOR_VERILATED_FUNCS_H_ + +#ifndef VERILATOR_VERILATED_H_INTERNAL_ +#error "verilated_funcs.h should only be included by verilated.h" +#endif + +//========================================================================= +// Extern functions -- User may override -- See verilated.cpp + +/// Routine to call for $finish +/// User code may wish to replace this function, to do so, define VL_USER_FINISH. +/// This code does not have to be thread safe. +/// Verilator internal code must call VL_FINISH_MT instead, which eventually calls this. +extern void vl_finish(const char* filename, int linenum, const char* hier); + +/// Routine to call for $stop and non-fatal error +/// User code may wish to replace this function, to do so, define VL_USER_STOP. +/// This code does not have to be thread safe. +/// Verilator internal code must call VL_FINISH_MT instead, which eventually calls this. 
+extern void vl_stop(const char* filename, int linenum, const char* hier); + +/// Routine to call for a couple of fatal messages +/// User code may wish to replace this function, to do so, define VL_USER_FATAL. +/// This code does not have to be thread safe. +/// Verilator internal code must call VL_FINISH_MT instead, which eventually calls this. +extern void vl_fatal(const char* filename, int linenum, const char* hier, const char* msg); + +//========================================================================= +// Extern functions -- Slow path + +/// Multithread safe wrapper for calls to $finish +extern void VL_FINISH_MT(const char* filename, int linenum, const char* hier) VL_MT_SAFE; +/// Multithread safe wrapper for calls to $stop +extern void VL_STOP_MT(const char* filename, int linenum, const char* hier, + bool maybe = true) VL_MT_SAFE; +/// Multithread safe wrapper to call for a couple of fatal messages +extern void VL_FATAL_MT(const char* filename, int linenum, const char* hier, + const char* msg) VL_MT_SAFE; + +// clang-format off +/// Print a string, multithread safe. Eventually VL_PRINTF will get called. +#ifdef VL_THREADED +extern void VL_PRINTF_MT(const char* formatp, ...) VL_ATTR_PRINTF(1) VL_MT_SAFE; +#else +# define VL_PRINTF_MT VL_PRINTF // The following parens will take care of themselves +#endif +// clang-format on + +/// Print a debug message from internals with standard prefix, with printf style format +extern void VL_DBG_MSGF(const char* formatp, ...) VL_ATTR_PRINTF(1) VL_MT_SAFE; + +inline IData VL_RANDOM_I(int obits) VL_MT_SAFE { return vl_rand64() & VL_MASK_I(obits); } +inline QData VL_RANDOM_Q(int obits) VL_MT_SAFE { return vl_rand64() & VL_MASK_Q(obits); } +#ifndef VL_NO_LEGACY +extern WDataOutP VL_RANDOM_W(int obits, WDataOutP outwp); +#endif +extern IData VL_RANDOM_SEEDED_II(int obits, IData seed) VL_MT_SAFE; +inline IData VL_URANDOM_RANGE_I(IData hi, IData lo) { + vluint64_t rnd = vl_rand64(); + if (VL_LIKELY(hi > lo)) { + // Modulus isn't very fast but it's common that hi-low is power-of-two + return (rnd % (hi - lo + 1)) + lo; + } else { + return (rnd % (lo - hi + 1)) + hi; + } +} + +// These are init time only, so slow is fine +/// Random reset a signal of given width +extern IData VL_RAND_RESET_I(int obits); +/// Random reset a signal of given width +extern QData VL_RAND_RESET_Q(int obits); +/// Random reset a signal of given width +extern WDataOutP VL_RAND_RESET_W(int obits, WDataOutP outwp); +/// Zero reset a signal (slow - else use VL_ZERO_W) +extern WDataOutP VL_ZERO_RESET_W(int obits, WDataOutP outwp); + +#if VL_THREADED +/// Return high-precision counter for profiling, or 0x0 if not available +inline QData VL_RDTSC_Q() { + vluint64_t val; + VL_RDTSC(val); + return val; +} +#endif + +extern void VL_PRINTTIMESCALE(const char* namep, const char* timeunitp, + const VerilatedContext* contextp) VL_MT_SAFE; + +extern WDataOutP _vl_moddiv_w(int lbits, WDataOutP owp, WDataInP const lwp, WDataInP const rwp, + bool is_modulus); + +extern IData VL_FGETS_IXI(int obits, void* destp, IData fpi); + +extern void VL_FFLUSH_I(IData fdi); +extern IData VL_FSEEK_I(IData fdi, IData offset, IData origin); +extern IData VL_FTELL_I(IData fdi); +extern void VL_FCLOSE_I(IData fdi); + +extern IData VL_FREAD_I(int width, int array_lsb, int array_size, void* memp, IData fpi, + IData start, IData count); + +extern void VL_WRITEF(const char* formatp, ...); +extern void VL_FWRITEF(IData fpi, const char* formatp, ...); + +extern IData VL_FSCANF_IX(IData fpi, const char* formatp, 
...); +extern IData VL_SSCANF_IIX(int lbits, IData ld, const char* formatp, ...); +extern IData VL_SSCANF_IQX(int lbits, QData ld, const char* formatp, ...); +extern IData VL_SSCANF_IWX(int lbits, WDataInP const lwp, const char* formatp, ...); + +extern void VL_SFORMAT_X(int obits, CData& destr, const char* formatp, ...); +extern void VL_SFORMAT_X(int obits, SData& destr, const char* formatp, ...); +extern void VL_SFORMAT_X(int obits, IData& destr, const char* formatp, ...); +extern void VL_SFORMAT_X(int obits, QData& destr, const char* formatp, ...); +extern void VL_SFORMAT_X(int obits, void* destp, const char* formatp, ...); + +extern IData VL_SYSTEM_IW(int lhswords, WDataInP const lhsp); +extern IData VL_SYSTEM_IQ(QData lhs); +inline IData VL_SYSTEM_II(IData lhs) VL_MT_SAFE { return VL_SYSTEM_IQ(lhs); } + +extern IData VL_TESTPLUSARGS_I(const char* formatp); +extern const char* vl_mc_scan_plusargs(const char* prefixp); // PLIish + +//========================================================================= +// Base macros + +// Return true if data[bit] set; not 0/1 return, but 0/non-zero return. +#define VL_BITISSET_I(data, bit) ((data) & (VL_UL(1) << VL_BITBIT_I(bit))) +#define VL_BITISSET_Q(data, bit) ((data) & (1ULL << VL_BITBIT_Q(bit))) +#define VL_BITISSET_E(data, bit) ((data) & (VL_EUL(1) << VL_BITBIT_E(bit))) +#define VL_BITISSET_W(data, bit) ((data)[VL_BITWORD_E(bit)] & (VL_EUL(1) << VL_BITBIT_E(bit))) +#define VL_BITISSETLIMIT_W(data, width, bit) (((bit) < (width)) && VL_BITISSET_W(data, bit)) + +// Shift appropriate word by bit. Does not account for wrapping between two words +#define VL_BITRSHIFT_W(data, bit) ((data)[VL_BITWORD_E(bit)] >> VL_BITBIT_E(bit)) + +// Create two 32-bit words from quadword +// WData is always at least 2 words; does not clean upper bits +#define VL_SET_WQ(owp, data) \ + do { \ + (owp)[0] = static_cast(data); \ + (owp)[1] = static_cast((data) >> VL_EDATASIZE); \ + } while (false) +#define VL_SET_WI(owp, data) \ + do { \ + (owp)[0] = static_cast(data); \ + (owp)[1] = 0; \ + } while (false) +#define VL_SET_QW(lwp) \ + ((static_cast((lwp)[0])) \ + | (static_cast((lwp)[1]) << (static_cast(VL_EDATASIZE)))) +#define VL_SET_QII(ld, rd) ((static_cast(ld) << 32ULL) | static_cast(rd)) + +// Return FILE* from IData +extern FILE* VL_CVT_I_FP(IData lhs) VL_MT_SAFE; + +// clang-format off +// Use a union to avoid cast-to-different-size warnings +// Return void* from QData +static inline void* VL_CVT_Q_VP(QData lhs) VL_PURE { + union { void* fp; QData q; } u; + u.q = lhs; + return u.fp; +} +// Return QData from const void* +static inline QData VL_CVT_VP_Q(const void* fp) VL_PURE { + union { const void* fp; QData q; } u; + u.q = 0; + u.fp = fp; + return u.q; +} +// Return double from QData (bits, not numerically) +static inline double VL_CVT_D_Q(QData lhs) VL_PURE { + union { double d; QData q; } u; + u.q = lhs; + return u.d; +} +// Return QData from double (bits, not numerically) +static inline QData VL_CVT_Q_D(double lhs) VL_PURE { + union { double d; QData q; } u; + u.d = lhs; + return u.q; +} +// clang-format on + +// Return double from lhs (numeric) unsigned +double VL_ITOR_D_W(int lbits, WDataInP const lwp) VL_PURE; +static inline double VL_ITOR_D_I(int, IData lhs) VL_PURE { + return static_cast(static_cast(lhs)); +} +static inline double VL_ITOR_D_Q(int, QData lhs) VL_PURE { + return static_cast(static_cast(lhs)); +} +// Return double from lhs (numeric) signed +double VL_ISTOR_D_W(int lbits, WDataInP const lwp) VL_PURE; +static inline double VL_ISTOR_D_I(int 
lbits, IData lhs) VL_PURE { + if (lbits == 32) return static_cast(static_cast(lhs)); + VlWide lwp; + VL_SET_WI(lwp, lhs); + return VL_ISTOR_D_W(lbits, lwp); +} +static inline double VL_ISTOR_D_Q(int lbits, QData lhs) VL_PURE { + if (lbits == 64) return static_cast(static_cast(lhs)); + VlWide lwp; + VL_SET_WQ(lwp, lhs); + return VL_ISTOR_D_W(lbits, lwp); +} +// Return QData from double (numeric) +static inline IData VL_RTOI_I_D(double lhs) VL_PURE { + return static_cast(VL_TRUNC(lhs)); +} + +// Sign extend such that if MSB set, we get ffff_ffff, else 0s +// (Requires clean input) +#define VL_SIGN_I(nbits, lhs) ((lhs) >> VL_BITBIT_I((nbits)-VL_UL(1))) +#define VL_SIGN_Q(nbits, lhs) ((lhs) >> VL_BITBIT_Q((nbits)-1ULL)) +#define VL_SIGN_E(nbits, lhs) ((lhs) >> VL_BITBIT_E((nbits)-VL_EUL(1))) +#define VL_SIGN_W(nbits, rwp) \ + ((rwp)[VL_BITWORD_E((nbits)-VL_EUL(1))] >> VL_BITBIT_E((nbits)-VL_EUL(1))) +#define VL_SIGNONES_E(nbits, lhs) (-(VL_SIGN_E(nbits, lhs))) + +// Sign bit extended up to MSB, doesn't include unsigned portion +// Optimization bug in GCC 3.3 returns different bitmasks to later states for +static inline IData VL_EXTENDSIGN_I(int lbits, IData lhs) VL_PURE { + return (-((lhs) & (VL_UL(1) << (lbits - 1)))); +} +static inline QData VL_EXTENDSIGN_Q(int lbits, QData lhs) VL_PURE { + return (-((lhs) & (1ULL << (lbits - 1)))); +} + +// Debugging prints +extern void _vl_debug_print_w(int lbits, WDataInP const iwp); + +//========================================================================= +// Pli macros + +extern int VL_TIME_STR_CONVERT(const char* strp) VL_PURE; + +// These are deprecated and used only to establish the default precision/units. +// Use Verilator timescale-override for better control. +// clang-format off +#ifndef VL_TIME_PRECISION +# ifdef VL_TIME_PRECISION_STR +# define VL_TIME_PRECISION VL_TIME_STR_CONVERT(VL_STRINGIFY(VL_TIME_PRECISION_STR)) +# else +# define VL_TIME_PRECISION (-12) ///< Timescale default units if not in Verilog - picoseconds +# endif +#endif +#ifndef VL_TIME_UNIT +# ifdef VL_TIME_UNIT_STR +# define VL_TIME_UNIT VL_TIME_STR_CONVERT(VL_STRINGIFY(VL_TIME_PRECISION_STR)) +# else +# define VL_TIME_UNIT (-12) ///< Timescale default units if not in Verilog - picoseconds +# endif +#endif + +#if defined(SYSTEMC_VERSION) +/// Return current simulation time +// Already defined: extern sc_time sc_time_stamp(); +inline vluint64_t vl_time_stamp64() { return sc_time_stamp().value(); } +#else // Non-SystemC +# if !defined(VL_TIME_CONTEXT) && !defined(VL_NO_LEGACY) +# ifdef VL_TIME_STAMP64 +// vl_time_stamp64() may be optionally defined by the user to return time. +// On MSVC++ weak symbols are not supported so must be declared, or define +// VL_TIME_CONTEXT. +extern vluint64_t vl_time_stamp64() VL_ATTR_WEAK; +# else +// sc_time_stamp() may be optionally defined by the user to return time. +// On MSVC++ weak symbols are not supported so must be declared, or define +// VL_TIME_CONTEXT. +extern double sc_time_stamp() VL_ATTR_WEAK; // Verilator 4.032 and newer +inline vluint64_t vl_time_stamp64() { + // clang9.0.1 requires & although we really do want the weak symbol value + return VL_LIKELY(&sc_time_stamp) ? 
static_cast(sc_time_stamp()) : 0; +} +# endif +# endif +#endif + +inline vluint64_t VerilatedContext::time() const VL_MT_SAFE { + // When using non-default context, fastest path is return time + if (VL_LIKELY(m_s.m_time)) return m_s.m_time; +#if defined(SYSTEMC_VERSION) || (!defined(VL_TIME_CONTEXT) && !defined(VL_NO_LEGACY)) + // Zero time could mean really at zero, or using callback + // clang9.0.1 requires & although we really do want the weak symbol value + if (VL_LIKELY(&vl_time_stamp64)) { // else is weak symbol that is not defined + return vl_time_stamp64(); + } +#endif + return 0; +} + +#define VL_TIME_Q() (Verilated::threadContextp()->time()) +#define VL_TIME_D() (static_cast(VL_TIME_Q())) + +// Time scaled from 1-per-precision into a module's time units ("Unit"-ed, not "United") +// Optimized assuming scale is always constant. +// Can't use multiply in Q flavor, as might lose precision +#define VL_TIME_UNITED_Q(scale) (VL_TIME_Q() / static_cast(scale)) +#define VL_TIME_UNITED_D(scale) (VL_TIME_D() / static_cast(scale)) + +// Return time precision as multiplier of time units +double vl_time_multiplier(int scale) VL_PURE; +// Return power of 10. e.g. returns 100 if n==2 +vluint64_t vl_time_pow10(int n) VL_PURE; + +#ifdef VL_DEBUG +/// Evaluate statement if Verilated::debug() enabled +# define VL_DEBUG_IF(stmt) \ + do { \ + if (VL_UNLIKELY(Verilated::debug())) {stmt} \ + } while (false) +#else +// We intentionally do not compile the stmt to improve compile speed +# define VL_DEBUG_IF(stmt) do {} while (false) +#endif + +// clang-format on + +//========================================================================= +// Functional macros/routines +// These all take the form +// VL_func_IW(bits, bits, op, op) +// VL_func_WW(bits, bits, out, op, op) +// The I/W indicates if it's a integer or wide for the output and each operand. +// The bits indicate the bit width of the output and each operand. +// If wide output, a temporary storage location is specified. + +//=================================================================== +// SETTING OPERATORS + +// Output clean +// EMIT_RULE: VL_CLEAN: oclean=clean; obits=lbits; +#define VL_CLEAN_II(obits, lbits, lhs) ((lhs)&VL_MASK_I(obits)) +#define VL_CLEAN_QQ(obits, lbits, lhs) ((lhs)&VL_MASK_Q(obits)) + +// EMIT_RULE: VL_ASSIGNCLEAN: oclean=clean; obits==lbits; +#define VL_ASSIGNCLEAN_W(obits, owp, lwp) VL_CLEAN_WW((obits), (obits), (owp), (lwp)) +static inline WDataOutP _vl_clean_inplace_w(int obits, WDataOutP owp) VL_MT_SAFE { + const int words = VL_WORDS_I(obits); + owp[words - 1] &= VL_MASK_E(obits); + return owp; +} +static inline WDataOutP VL_CLEAN_WW(int obits, int, WDataOutP owp, WDataInP const lwp) VL_MT_SAFE { + const int words = VL_WORDS_I(obits); + for (int i = 0; (i < (words - 1)); ++i) owp[i] = lwp[i]; + owp[words - 1] = lwp[words - 1] & VL_MASK_E(obits); + return owp; +} +static inline WDataOutP VL_ZERO_W(int obits, WDataOutP owp) VL_MT_SAFE { + const int words = VL_WORDS_I(obits); + for (int i = 0; i < words; ++i) owp[i] = 0; + return owp; +} +static inline WDataOutP VL_ALLONES_W(int obits, WDataOutP owp) VL_MT_SAFE { + const int words = VL_WORDS_I(obits); + for (int i = 0; i < (words - 1); ++i) owp[i] = ~VL_EUL(0); + owp[words - 1] = VL_MASK_E(obits); + return owp; +} + +// EMIT_RULE: VL_ASSIGN: oclean=rclean; obits==lbits; +// For now, we always have a clean rhs. +// Note: If a ASSIGN isn't clean, use VL_ASSIGNCLEAN instead to do the same thing. 
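For readers new to these EMIT_RULE annotations: "clean" means every bit above the declared width is zero, "dirty" means those bits may hold stale data, and VL_CLEAN_II above is just a mask. A minimal standalone illustration in plain C++ (hypothetical helper name, not the Verilated macro):

#include <cassert>
#include <cstdint>

// Keep only the low 'obits' bits of a 32-bit value (1 <= obits <= 32),
// the same masking that VL_MASK_I / VL_CLEAN_II perform for IData-sized signals.
static uint32_t clean32(int obits, uint32_t dirty) {
    const uint32_t mask = (obits >= 32) ? 0xffffffffu : ((1u << obits) - 1u);
    return dirty & mask;
}

int main() {
    // A 5-bit signal stored in a 32-bit word: bits [31:5] may hold garbage.
    const uint32_t dirty = 0xdeadbe1fu;
    assert(clean32(5, dirty) == 0x1fu);  // the "clean" value keeps only bits [4:0]
    return 0;
}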
+static inline WDataOutP VL_ASSIGN_W(int obits, WDataOutP owp, WDataInP const lwp) VL_MT_SAFE { + const int words = VL_WORDS_I(obits); + for (int i = 0; i < words; ++i) owp[i] = lwp[i]; + return owp; +} + +// EMIT_RULE: VL_ASSIGNBIT: rclean=clean; +static inline void VL_ASSIGNBIT_II(int, int bit, CData& lhsr, IData rhs) VL_PURE { + lhsr = ((lhsr & ~(VL_UL(1) << VL_BITBIT_I(bit))) | (rhs << VL_BITBIT_I(bit))); +} +static inline void VL_ASSIGNBIT_II(int, int bit, SData& lhsr, IData rhs) VL_PURE { + lhsr = ((lhsr & ~(VL_UL(1) << VL_BITBIT_I(bit))) | (rhs << VL_BITBIT_I(bit))); +} +static inline void VL_ASSIGNBIT_II(int, int bit, IData& lhsr, IData rhs) VL_PURE { + lhsr = ((lhsr & ~(VL_UL(1) << VL_BITBIT_I(bit))) | (rhs << VL_BITBIT_I(bit))); +} +static inline void VL_ASSIGNBIT_QI(int, int bit, QData& lhsr, QData rhs) VL_PURE { + lhsr = ((lhsr & ~(1ULL << VL_BITBIT_Q(bit))) | (static_cast(rhs) << VL_BITBIT_Q(bit))); +} +static inline void VL_ASSIGNBIT_WI(int, int bit, WDataOutP owp, IData rhs) VL_MT_SAFE { + EData orig = owp[VL_BITWORD_E(bit)]; + owp[VL_BITWORD_E(bit)] = ((orig & ~(VL_EUL(1) << VL_BITBIT_E(bit))) + | (static_cast(rhs) << VL_BITBIT_E(bit))); +} +// Alternative form that is an instruction faster when rhs is constant one. +static inline void VL_ASSIGNBIT_IO(int, int bit, CData& lhsr, IData) VL_PURE { + lhsr = (lhsr | (VL_UL(1) << VL_BITBIT_I(bit))); +} +static inline void VL_ASSIGNBIT_IO(int, int bit, SData& lhsr, IData) VL_PURE { + lhsr = (lhsr | (VL_UL(1) << VL_BITBIT_I(bit))); +} +static inline void VL_ASSIGNBIT_IO(int, int bit, IData& lhsr, IData) VL_PURE { + lhsr = (lhsr | (VL_UL(1) << VL_BITBIT_I(bit))); +} +static inline void VL_ASSIGNBIT_QO(int, int bit, QData& lhsr, IData) VL_PURE { + lhsr = (lhsr | (1ULL << VL_BITBIT_Q(bit))); +} +static inline void VL_ASSIGNBIT_WO(int, int bit, WDataOutP owp, IData) VL_MT_SAFE { + const EData orig = owp[VL_BITWORD_E(bit)]; + owp[VL_BITWORD_E(bit)] = (orig | (VL_EUL(1) << VL_BITBIT_E(bit))); +} + +//=================================================================== +// SYSTEMC OPERATORS +// Copying verilog format to systemc integers and bit vectors. +// Get a SystemC variable + +#define VL_ASSIGN_ISI(obits, vvar, svar) \ + { (vvar) = VL_CLEAN_II((obits), (obits), (svar).read()); } +#define VL_ASSIGN_QSQ(obits, vvar, svar) \ + { (vvar) = VL_CLEAN_QQ((obits), (obits), (svar).read()); } + +#define VL_ASSIGN_ISW(obits, od, svar) \ + { (od) = ((svar).read().get_word(0)) & VL_MASK_I(obits); } +#define VL_ASSIGN_QSW(obits, od, svar) \ + { \ + (od) = ((static_cast((svar).read().get_word(1))) << VL_IDATASIZE \ + | (svar).read().get_word(0)) \ + & VL_MASK_Q(obits); \ + } +#define VL_ASSIGN_WSW(obits, owp, svar) \ + { \ + const int words = VL_WORDS_I(obits); \ + for (int i = 0; i < words; ++i) (owp)[i] = (svar).read().get_word(i); \ + (owp)[words - 1] &= VL_MASK_E(obits); \ + } + +#define VL_ASSIGN_ISU(obits, vvar, svar) \ + { (vvar) = VL_CLEAN_II((obits), (obits), (svar).read().to_uint()); } +#define VL_ASSIGN_QSU(obits, vvar, svar) \ + { (vvar) = VL_CLEAN_QQ((obits), (obits), (svar).read().to_uint64()); } +#define VL_ASSIGN_WSB(obits, owp, svar) \ + { \ + const int words = VL_WORDS_I(obits); \ + sc_biguint<(obits)> _butemp = (svar).read(); \ + for (int i = 0; i < words; ++i) { \ + int msb = ((i + 1) * VL_IDATASIZE) - 1; \ + msb = (msb >= (obits)) ? 
((obits)-1) : msb; \ + (owp)[i] = _butemp.range(msb, i * VL_IDATASIZE).to_uint(); \ + } \ + (owp)[words - 1] &= VL_MASK_E(obits); \ + } + +// Copying verilog format from systemc integers and bit vectors. +// Set a SystemC variable + +#define VL_ASSIGN_SII(obits, svar, vvar) \ + { (svar).write(vvar); } +#define VL_ASSIGN_SQQ(obits, svar, vvar) \ + { (svar).write(vvar); } + +#define VL_ASSIGN_SWI(obits, svar, rd) \ + { \ + sc_bv<(obits)> _bvtemp; \ + _bvtemp.set_word(0, (rd)); \ + (svar).write(_bvtemp); \ + } +#define VL_ASSIGN_SWQ(obits, svar, rd) \ + { \ + sc_bv<(obits)> _bvtemp; \ + _bvtemp.set_word(0, static_cast(rd)); \ + _bvtemp.set_word(1, static_cast((rd) >> VL_IDATASIZE)); \ + (svar).write(_bvtemp); \ + } +#define VL_ASSIGN_SWW(obits, svar, rwp) \ + { \ + sc_bv<(obits)> _bvtemp; \ + for (int i = 0; i < VL_WORDS_I(obits); ++i) _bvtemp.set_word(i, (rwp)[i]); \ + (svar).write(_bvtemp); \ + } + +#define VL_ASSIGN_SUI(obits, svar, rd) \ + { (svar).write(rd); } +#define VL_ASSIGN_SUQ(obits, svar, rd) \ + { (svar).write(rd); } +#define VL_ASSIGN_SBI(obits, svar, rd) \ + { (svar).write(rd); } +#define VL_ASSIGN_SBQ(obits, svar, rd) \ + { (svar).write(rd); } +#define VL_ASSIGN_SBW(obits, svar, rwp) \ + { \ + sc_biguint<(obits)> _butemp; \ + for (int i = 0; i < VL_WORDS_I(obits); ++i) { \ + int msb = ((i + 1) * VL_IDATASIZE) - 1; \ + msb = (msb >= (obits)) ? ((obits)-1) : msb; \ + _butemp.range(msb, i* VL_IDATASIZE) = (rwp)[i]; \ + } \ + (svar).write(_butemp); \ + } + +//=================================================================== +// Extending sizes + +// CAREFUL, we're width changing, so obits!=lbits + +// Right must be clean because otherwise size increase would pick up bad bits +// EMIT_RULE: VL_EXTEND: oclean=clean; rclean==clean; +#define VL_EXTEND_II(obits, lbits, lhs) ((lhs)) +#define VL_EXTEND_QI(obits, lbits, lhs) (static_cast(lhs)) +#define VL_EXTEND_QQ(obits, lbits, lhs) ((lhs)) + +static inline WDataOutP VL_EXTEND_WI(int obits, int, WDataOutP owp, IData ld) VL_MT_SAFE { + // Note for extracts that obits != lbits + owp[0] = ld; + for (int i = 1; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + return owp; +} +static inline WDataOutP VL_EXTEND_WQ(int obits, int, WDataOutP owp, QData ld) VL_MT_SAFE { + VL_SET_WQ(owp, ld); + for (int i = VL_WQ_WORDS_E; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + return owp; +} +static inline WDataOutP VL_EXTEND_WW(int obits, int lbits, WDataOutP owp, + WDataInP const lwp) VL_MT_SAFE { + for (int i = 0; i < VL_WORDS_I(lbits); ++i) owp[i] = lwp[i]; + for (int i = VL_WORDS_I(lbits); i < VL_WORDS_I(obits); ++i) owp[i] = 0; + return owp; +} + +// EMIT_RULE: VL_EXTENDS: oclean=*dirty*; obits=lbits; +// Sign extension; output dirty +static inline IData VL_EXTENDS_II(int, int lbits, IData lhs) VL_PURE { + return VL_EXTENDSIGN_I(lbits, lhs) | lhs; +} +static inline QData VL_EXTENDS_QI(int, int lbits, QData lhs /*Q_as_need_extended*/) VL_PURE { + return VL_EXTENDSIGN_Q(lbits, lhs) | lhs; +} +static inline QData VL_EXTENDS_QQ(int, int lbits, QData lhs) VL_PURE { + return VL_EXTENDSIGN_Q(lbits, lhs) | lhs; +} + +static inline WDataOutP VL_EXTENDS_WI(int obits, int lbits, WDataOutP owp, IData ld) VL_MT_SAFE { + const EData sign = VL_SIGNONES_E(lbits, static_cast(ld)); + owp[0] = ld | (sign & ~VL_MASK_E(lbits)); + for (int i = 1; i < VL_WORDS_I(obits); ++i) owp[i] = sign; + return owp; +} +static inline WDataOutP VL_EXTENDS_WQ(int obits, int lbits, WDataOutP owp, QData ld) VL_MT_SAFE { + VL_SET_WQ(owp, ld); + const EData sign = VL_SIGNONES_E(lbits, owp[1]); + owp[1] 
|= sign & ~VL_MASK_E(lbits); + for (int i = VL_WQ_WORDS_E; i < VL_WORDS_I(obits); ++i) owp[i] = sign; + return owp; +} +static inline WDataOutP VL_EXTENDS_WW(int obits, int lbits, WDataOutP owp, + WDataInP const lwp) VL_MT_SAFE { + for (int i = 0; i < VL_WORDS_I(lbits) - 1; ++i) owp[i] = lwp[i]; + const int lmsw = VL_WORDS_I(lbits) - 1; + const EData sign = VL_SIGNONES_E(lbits, lwp[lmsw]); + owp[lmsw] = lwp[lmsw] | (sign & ~VL_MASK_E(lbits)); + for (int i = VL_WORDS_I(lbits); i < VL_WORDS_I(obits); ++i) owp[i] = sign; + return owp; +} + +//=================================================================== +// REDUCTION OPERATORS + +// EMIT_RULE: VL_REDAND: oclean=clean; lclean==clean; obits=1; +#define VL_REDAND_II(obits, lbits, lhs) ((lhs) == VL_MASK_I(lbits)) +#define VL_REDAND_IQ(obits, lbits, lhs) ((lhs) == VL_MASK_Q(lbits)) +static inline IData VL_REDAND_IW(int, int lbits, WDataInP const lwp) VL_MT_SAFE { + const int words = VL_WORDS_I(lbits); + EData combine = lwp[0]; + for (int i = 1; i < words - 1; ++i) combine &= lwp[i]; + combine &= ~VL_MASK_E(lbits) | lwp[words - 1]; + return ((~combine) == 0); +} + +// EMIT_RULE: VL_REDOR: oclean=clean; lclean==clean; obits=1; +#define VL_REDOR_I(lhs) ((lhs) != 0) +#define VL_REDOR_Q(lhs) ((lhs) != 0) +static inline IData VL_REDOR_W(int words, WDataInP const lwp) VL_MT_SAFE { + EData equal = 0; + for (int i = 0; i < words; ++i) equal |= lwp[i]; + return (equal != 0); +} + +// EMIT_RULE: VL_REDXOR: oclean=dirty; obits=1; +static inline IData VL_REDXOR_2(IData r) VL_PURE { + // Experiments show VL_REDXOR_2 is faster than __builtin_parityl + r = (r ^ (r >> 1)); + return r; +} +static inline IData VL_REDXOR_4(IData r) VL_PURE { +#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS) + return __builtin_parityl(r); +#else + r = (r ^ (r >> 1)); + r = (r ^ (r >> 2)); + return r; +#endif +} +static inline IData VL_REDXOR_8(IData r) VL_PURE { +#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS) + return __builtin_parityl(r); +#else + r = (r ^ (r >> 1)); + r = (r ^ (r >> 2)); + r = (r ^ (r >> 4)); + return r; +#endif +} +static inline IData VL_REDXOR_16(IData r) VL_PURE { +#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS) + return __builtin_parityl(r); +#else + r = (r ^ (r >> 1)); + r = (r ^ (r >> 2)); + r = (r ^ (r >> 4)); + r = (r ^ (r >> 8)); + return r; +#endif +} +static inline IData VL_REDXOR_32(IData r) VL_PURE { +#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS) + return __builtin_parityl(r); +#else + r = (r ^ (r >> 1)); + r = (r ^ (r >> 2)); + r = (r ^ (r >> 4)); + r = (r ^ (r >> 8)); + r = (r ^ (r >> 16)); + return r; +#endif +} +static inline IData VL_REDXOR_64(QData r) VL_PURE { +#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(VL_NO_BUILTINS) + return __builtin_parityll(r); +#else + r = (r ^ (r >> 1)); + r = (r ^ (r >> 2)); + r = (r ^ (r >> 4)); + r = (r ^ (r >> 8)); + r = (r ^ (r >> 16)); + r = (r ^ (r >> 32)); + return static_cast(r); +#endif +} +static inline IData VL_REDXOR_W(int words, WDataInP const lwp) VL_MT_SAFE { + EData r = lwp[0]; + for (int i = 1; i < words; ++i) r ^= lwp[i]; + return VL_REDXOR_32(r); +} + +// EMIT_RULE: VL_COUNTONES_II: oclean = false; lhs clean +static inline IData VL_COUNTONES_I(IData lhs) VL_PURE { + // This is faster than __builtin_popcountl + IData r = lhs - ((lhs >> 1) & 033333333333) - ((lhs >> 2) & 011111111111); + r = (r + (r >> 3)) & 030707070707; + r = (r + (r >> 6)); + r = (r + (r >> 12) + (r >> 24)) & 077; + return r; +} 
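VL_COUNTONES_I above counts set bits three at a time with octal masks and then folds the partial sums, rather than looping or calling __builtin_popcount. Below is a small standalone sketch that reuses the same constants and checks them against a naive bit-by-bit count (illustrative names, not the Verilated API):

#include <cassert>
#include <cstdint>

// Same octal-mask technique as VL_COUNTONES_I: per-3-bit partial sums that are
// then folded together; the constants are octal on purpose.
static uint32_t popcount_octal(uint32_t x) {
    uint32_t r = x - ((x >> 1) & 033333333333u) - ((x >> 2) & 011111111111u);
    r = (r + (r >> 3)) & 030707070707u;
    r = r + (r >> 6);
    r = (r + (r >> 12) + (r >> 24)) & 077u;
    return r;
}

// Naive reference: count one bit per iteration.
static uint32_t popcount_naive(uint32_t x) {
    uint32_t n = 0;
    for (; x != 0; x >>= 1) n += x & 1u;
    return n;
}

int main() {
    const uint32_t tests[] = {0u, 1u, 0x80000001u, 0x12345678u, 0xffffffffu};
    for (uint32_t x : tests) assert(popcount_octal(x) == popcount_naive(x));
    return 0;
}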
+static inline IData VL_COUNTONES_Q(QData lhs) VL_PURE { + return VL_COUNTONES_I(static_cast(lhs)) + VL_COUNTONES_I(static_cast(lhs >> 32)); +} +#define VL_COUNTONES_E VL_COUNTONES_I +static inline IData VL_COUNTONES_W(int words, WDataInP const lwp) VL_MT_SAFE { + EData r = 0; + for (int i = 0; i < words; ++i) r += VL_COUNTONES_E(lwp[i]); + return r; +} + +// EMIT_RULE: VL_COUNTBITS_II: oclean = false; lhs clean +static inline IData VL_COUNTBITS_I(int lbits, IData lhs, IData ctrl0, IData ctrl1, + IData ctrl2) VL_PURE { + int ctrlSum = (ctrl0 & 0x1) + (ctrl1 & 0x1) + (ctrl2 & 0x1); + if (ctrlSum == 3) { + return VL_COUNTONES_I(lhs); + } else if (ctrlSum == 0) { + IData mask = (lbits == 32) ? -1 : ((1 << lbits) - 1); + return VL_COUNTONES_I(~lhs & mask); + } else { + return (lbits == 32) ? 32 : lbits; + } +} +static inline IData VL_COUNTBITS_Q(int lbits, QData lhs, IData ctrl0, IData ctrl1, + IData ctrl2) VL_PURE { + return VL_COUNTBITS_I(32, static_cast(lhs), ctrl0, ctrl1, ctrl2) + + VL_COUNTBITS_I(lbits - 32, static_cast(lhs >> 32), ctrl0, ctrl1, ctrl2); +} +#define VL_COUNTBITS_E VL_COUNTBITS_I +static inline IData VL_COUNTBITS_W(int lbits, int words, WDataInP const lwp, IData ctrl0, + IData ctrl1, IData ctrl2) VL_MT_SAFE { + EData r = 0; + IData wordLbits = 32; + for (int i = 0; i < words; ++i) { + if (i == words - 1) wordLbits = lbits % 32; + r += VL_COUNTBITS_E(wordLbits, lwp[i], ctrl0, ctrl1, ctrl2); + } + return r; +} + +static inline IData VL_ONEHOT_I(IData lhs) VL_PURE { + return (((lhs & (lhs - 1)) == 0) & (lhs != 0)); +} +static inline IData VL_ONEHOT_Q(QData lhs) VL_PURE { + return (((lhs & (lhs - 1)) == 0) & (lhs != 0)); +} +static inline IData VL_ONEHOT_W(int words, WDataInP const lwp) VL_MT_SAFE { + EData one = 0; + for (int i = 0; (i < words); ++i) { + if (lwp[i]) { + if (one) return 0; + one = 1; + if (lwp[i] & (lwp[i] - 1)) return 0; + } + } + return one; +} + +static inline IData VL_ONEHOT0_I(IData lhs) VL_PURE { return ((lhs & (lhs - 1)) == 0); } +static inline IData VL_ONEHOT0_Q(QData lhs) VL_PURE { return ((lhs & (lhs - 1)) == 0); } +static inline IData VL_ONEHOT0_W(int words, WDataInP const lwp) VL_MT_SAFE { + bool one = false; + for (int i = 0; (i < words); ++i) { + if (lwp[i]) { + if (one) return 0; + one = true; + if (lwp[i] & (lwp[i] - 1)) return 0; + } + } + return 1; +} + +static inline IData VL_CLOG2_I(IData lhs) VL_PURE { + // There are faster algorithms, or fls GCC4 builtins, but rarely used + if (VL_UNLIKELY(!lhs)) return 0; + --lhs; + int shifts = 0; + for (; lhs != 0; ++shifts) lhs = lhs >> 1; + return shifts; +} +static inline IData VL_CLOG2_Q(QData lhs) VL_PURE { + if (VL_UNLIKELY(!lhs)) return 0; + --lhs; + int shifts = 0; + for (; lhs != 0; ++shifts) lhs = lhs >> 1ULL; + return shifts; +} +static inline IData VL_CLOG2_W(int words, WDataInP const lwp) VL_MT_SAFE { + EData adjust = (VL_COUNTONES_W(words, lwp) == 1) ? 0 : 1; + for (int i = words - 1; i >= 0; --i) { + if (VL_UNLIKELY(lwp[i])) { // Shorter worst case if predict not taken + for (int bit = VL_EDATASIZE - 1; bit >= 0; --bit) { + if (VL_UNLIKELY(VL_BITISSET_E(lwp[i], bit))) { + return i * VL_EDATASIZE + bit + adjust; + } + } + // Can't get here - one bit must be set + } + } + return 0; +} + +static inline IData VL_MOSTSETBITP1_W(int words, WDataInP const lwp) VL_MT_SAFE { + // MSB set bit plus one; similar to FLS. 
0=value is zero + for (int i = words - 1; i >= 0; --i) { + if (VL_UNLIKELY(lwp[i])) { // Shorter worst case if predict not taken + for (int bit = VL_EDATASIZE - 1; bit >= 0; --bit) { + if (VL_UNLIKELY(VL_BITISSET_E(lwp[i], bit))) return i * VL_EDATASIZE + bit + 1; + } + // Can't get here - one bit must be set + } + } + return 0; +} + +//=================================================================== +// SIMPLE LOGICAL OPERATORS + +// EMIT_RULE: VL_AND: oclean=lclean||rclean; obits=lbits; lbits==rbits; +static inline WDataOutP VL_AND_W(int words, WDataOutP owp, WDataInP const lwp, + WDataInP const rwp) VL_MT_SAFE { + for (int i = 0; (i < words); ++i) owp[i] = (lwp[i] & rwp[i]); + return owp; +} +// EMIT_RULE: VL_OR: oclean=lclean&&rclean; obits=lbits; lbits==rbits; +static inline WDataOutP VL_OR_W(int words, WDataOutP owp, WDataInP const lwp, + WDataInP const rwp) VL_MT_SAFE { + for (int i = 0; (i < words); ++i) owp[i] = (lwp[i] | rwp[i]); + return owp; +} +// EMIT_RULE: VL_CHANGEXOR: oclean=1; obits=32; lbits==rbits; +static inline IData VL_CHANGEXOR_W(int words, WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { + IData od = 0; + for (int i = 0; (i < words); ++i) od |= (lwp[i] ^ rwp[i]); + return od; +} +// EMIT_RULE: VL_XOR: oclean=lclean&&rclean; obits=lbits; lbits==rbits; +static inline WDataOutP VL_XOR_W(int words, WDataOutP owp, WDataInP const lwp, + WDataInP const rwp) VL_MT_SAFE { + for (int i = 0; (i < words); ++i) owp[i] = (lwp[i] ^ rwp[i]); + return owp; +} +// EMIT_RULE: VL_NOT: oclean=dirty; obits=lbits; +static inline WDataOutP VL_NOT_W(int words, WDataOutP owp, WDataInP const lwp) VL_MT_SAFE { + for (int i = 0; i < words; ++i) owp[i] = ~(lwp[i]); + return owp; +} + +//========================================================================= +// Logical comparisons + +// EMIT_RULE: VL_EQ: oclean=clean; lclean==clean; rclean==clean; obits=1; lbits==rbits; +// EMIT_RULE: VL_NEQ: oclean=clean; lclean==clean; rclean==clean; obits=1; lbits==rbits; +// EMIT_RULE: VL_LT: oclean=clean; lclean==clean; rclean==clean; obits=1; lbits==rbits; +// EMIT_RULE: VL_GT: oclean=clean; lclean==clean; rclean==clean; obits=1; lbits==rbits; +// EMIT_RULE: VL_GTE: oclean=clean; lclean==clean; rclean==clean; obits=1; lbits==rbits; +// EMIT_RULE: VL_LTE: oclean=clean; lclean==clean; rclean==clean; obits=1; lbits==rbits; +#define VL_NEQ_W(words, lwp, rwp) (!VL_EQ_W(words, lwp, rwp)) +#define VL_LT_W(words, lwp, rwp) (_vl_cmp_w(words, lwp, rwp) < 0) +#define VL_LTE_W(words, lwp, rwp) (_vl_cmp_w(words, lwp, rwp) <= 0) +#define VL_GT_W(words, lwp, rwp) (_vl_cmp_w(words, lwp, rwp) > 0) +#define VL_GTE_W(words, lwp, rwp) (_vl_cmp_w(words, lwp, rwp) >= 0) + +// Output clean, AND MUST BE CLEAN +static inline IData VL_EQ_W(int words, WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { + EData nequal = 0; + for (int i = 0; (i < words); ++i) nequal |= (lwp[i] ^ rwp[i]); + return (nequal == 0); +} + +// Internal usage +static inline int _vl_cmp_w(int words, WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { + for (int i = words - 1; i >= 0; --i) { + if (lwp[i] > rwp[i]) return 1; + if (lwp[i] < rwp[i]) return -1; + } + return 0; // == +} + +#define VL_LTS_IWW(obits, lbits, rbbits, lwp, rwp) (_vl_cmps_w(lbits, lwp, rwp) < 0) +#define VL_LTES_IWW(obits, lbits, rbits, lwp, rwp) (_vl_cmps_w(lbits, lwp, rwp) <= 0) +#define VL_GTS_IWW(obits, lbits, rbits, lwp, rwp) (_vl_cmps_w(lbits, lwp, rwp) > 0) +#define VL_GTES_IWW(obits, lbits, rbits, lwp, rwp) (_vl_cmps_w(lbits, lwp, rwp) >= 0) + +static inline IData 
VL_GTS_III(int, int lbits, int, IData lhs, IData rhs) VL_PURE { + // For lbits==32, this becomes just a single instruction, otherwise ~5. + // GCC 3.3.4 sign extension bugs on AMD64 architecture force us to use quad logic + const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); // Q for gcc + const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); // Q for gcc + return lhs_signed > rhs_signed; +} +static inline IData VL_GTS_IQQ(int, int lbits, int, QData lhs, QData rhs) VL_PURE { + const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); + const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); + return lhs_signed > rhs_signed; +} + +static inline IData VL_GTES_III(int, int lbits, int, IData lhs, IData rhs) VL_PURE { + const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); // Q for gcc + const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); // Q for gcc + return lhs_signed >= rhs_signed; +} +static inline IData VL_GTES_IQQ(int, int lbits, int, QData lhs, QData rhs) VL_PURE { + const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); + const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); + return lhs_signed >= rhs_signed; +} + +static inline IData VL_LTS_III(int, int lbits, int, IData lhs, IData rhs) VL_PURE { + const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); // Q for gcc + const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); // Q for gcc + return lhs_signed < rhs_signed; +} +static inline IData VL_LTS_IQQ(int, int lbits, int, QData lhs, QData rhs) VL_PURE { + const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); + const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); + return lhs_signed < rhs_signed; +} + +static inline IData VL_LTES_III(int, int lbits, int, IData lhs, IData rhs) VL_PURE { + const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); // Q for gcc + const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); // Q for gcc + return lhs_signed <= rhs_signed; +} +static inline IData VL_LTES_IQQ(int, int lbits, int, QData lhs, QData rhs) VL_PURE { + const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); + const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); + return lhs_signed <= rhs_signed; +} + +static inline int _vl_cmps_w(int lbits, WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { + const int words = VL_WORDS_I(lbits); + int i = words - 1; + // We need to flip sense if negative comparison + const EData lsign = VL_SIGN_E(lbits, lwp[i]); + const EData rsign = VL_SIGN_E(lbits, rwp[i]); + if (!lsign && rsign) return 1; // + > - + if (lsign && !rsign) return -1; // - < + + for (; i >= 0; --i) { + if (lwp[i] > rwp[i]) return 1; + if (lwp[i] < rwp[i]) return -1; + } + return 0; // == +} + +//========================================================================= +// Math + +// Output NOT clean +static inline WDataOutP VL_NEGATE_W(int words, WDataOutP owp, WDataInP const lwp) VL_MT_SAFE { + EData carry = 1; + for (int i = 0; i < words; ++i) { + owp[i] = ~lwp[i] + carry; + carry = (owp[i] < ~lwp[i]); + } + return owp; +} +static inline void VL_NEGATE_INPLACE_W(int words, WDataOutP owp_lwp) VL_MT_SAFE { + EData carry = 1; + for (int i = 0; i < words; ++i) { + EData word = ~owp_lwp[i] + carry; + carry = (word < ~owp_lwp[i]); + owp_lwp[i] = word; + } +} + +// EMIT_RULE: VL_MUL: oclean=dirty; lclean==clean; rclean==clean; +// EMIT_RULE: VL_DIV: oclean=dirty; lclean==clean; rclean==clean; +// EMIT_RULE: VL_MODDIV: oclean=dirty; lclean==clean; rclean==clean; +#define VL_DIV_III(lbits, lhs, 
rhs) (((rhs) == 0) ? 0 : (lhs) / (rhs)) +#define VL_DIV_QQQ(lbits, lhs, rhs) (((rhs) == 0) ? 0 : (lhs) / (rhs)) +#define VL_DIV_WWW(lbits, owp, lwp, rwp) (_vl_moddiv_w(lbits, owp, lwp, rwp, 0)) +#define VL_MODDIV_III(lbits, lhs, rhs) (((rhs) == 0) ? 0 : (lhs) % (rhs)) +#define VL_MODDIV_QQQ(lbits, lhs, rhs) (((rhs) == 0) ? 0 : (lhs) % (rhs)) +#define VL_MODDIV_WWW(lbits, owp, lwp, rwp) (_vl_moddiv_w(lbits, owp, lwp, rwp, 1)) + +static inline WDataOutP VL_ADD_W(int words, WDataOutP owp, WDataInP const lwp, + WDataInP const rwp) VL_MT_SAFE { + QData carry = 0; + for (int i = 0; i < words; ++i) { + carry = carry + static_cast(lwp[i]) + static_cast(rwp[i]); + owp[i] = (carry & 0xffffffffULL); + carry = (carry >> 32ULL) & 0xffffffffULL; + } + // Last output word is dirty + return owp; +} + +static inline WDataOutP VL_SUB_W(int words, WDataOutP owp, WDataInP const lwp, + WDataInP const rwp) VL_MT_SAFE { + QData carry = 0; + for (int i = 0; i < words; ++i) { + carry = (carry + static_cast(lwp[i]) + + static_cast(static_cast(~rwp[i]))); + if (i == 0) ++carry; // Negation of rwp + owp[i] = (carry & 0xffffffffULL); + carry = (carry >> 32ULL) & 0xffffffffULL; + } + // Last output word is dirty + return owp; +} + +static inline WDataOutP VL_MUL_W(int words, WDataOutP owp, WDataInP const lwp, + WDataInP const rwp) VL_MT_SAFE { + for (int i = 0; i < words; ++i) owp[i] = 0; + for (int lword = 0; lword < words; ++lword) { + for (int rword = 0; rword < words; ++rword) { + QData mul = static_cast(lwp[lword]) * static_cast(rwp[rword]); + for (int qword = lword + rword; qword < words; ++qword) { + mul += static_cast(owp[qword]); + owp[qword] = (mul & 0xffffffffULL); + mul = (mul >> 32ULL) & 0xffffffffULL; + } + } + } + // Last output word is dirty + return owp; +} + +static inline IData VL_MULS_III(int, int lbits, int, IData lhs, IData rhs) VL_PURE { + const vlsint32_t lhs_signed = VL_EXTENDS_II(32, lbits, lhs); + const vlsint32_t rhs_signed = VL_EXTENDS_II(32, lbits, rhs); + return lhs_signed * rhs_signed; +} +static inline QData VL_MULS_QQQ(int, int lbits, int, QData lhs, QData rhs) VL_PURE { + const vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs); + const vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs); + return lhs_signed * rhs_signed; +} + +static inline WDataOutP VL_MULS_WWW(int, int lbits, int, WDataOutP owp, WDataInP const lwp, + WDataInP const rwp) VL_MT_SAFE { + const int words = VL_WORDS_I(lbits); + // cppcheck-suppress variableScope + WData lwstore[VL_MULS_MAX_WORDS]; // Fixed size, as MSVC++ doesn't allow [words] here + // cppcheck-suppress variableScope + WData rwstore[VL_MULS_MAX_WORDS]; + WDataInP lwusp = lwp; + WDataInP rwusp = rwp; + EData lneg = VL_SIGN_E(lbits, lwp[words - 1]); + if (lneg) { // Negate lhs + lwusp = lwstore; + VL_NEGATE_W(words, lwstore, lwp); + lwstore[words - 1] &= VL_MASK_E(lbits); // Clean it + } + EData rneg = VL_SIGN_E(lbits, rwp[words - 1]); + if (rneg) { // Negate rhs + rwusp = rwstore; + VL_NEGATE_W(words, rwstore, rwp); + rwstore[words - 1] &= VL_MASK_E(lbits); // Clean it + } + VL_MUL_W(words, owp, lwusp, rwusp); + owp[words - 1] &= VL_MASK_E( + lbits); // Clean. 
Note it's ok for the multiply to overflow into the sign bit + if ((lneg ^ rneg) & 1) { // Negate output (not using NEGATE, as owp==lwp) + QData carry = 0; + for (int i = 0; i < words; ++i) { + carry = carry + static_cast(static_cast(~owp[i])); + if (i == 0) ++carry; // Negation of temp2 + owp[i] = (carry & 0xffffffffULL); + carry = (carry >> 32ULL) & 0xffffffffULL; + } + // Not needed: owp[words-1] |= 1< 0) power = power * power; + if (rhs & (1ULL << i)) out *= power; + } + return out; +} +static inline QData VL_POW_QQQ(int, int, int rbits, QData lhs, QData rhs) VL_PURE { + if (VL_UNLIKELY(rhs == 0)) return 1; + if (VL_UNLIKELY(lhs == 0)) return 0; + QData power = lhs; + QData out = 1ULL; + for (int i = 0; i < rbits; ++i) { + if (i > 0) power = power * power; + if (rhs & (1ULL << i)) out *= power; + } + return out; +} +WDataOutP VL_POW_WWW(int obits, int, int rbits, WDataOutP owp, WDataInP const lwp, + WDataInP const rwp); +WDataOutP VL_POW_WWQ(int obits, int, int rbits, WDataOutP owp, WDataInP const lwp, QData rhs); +QData VL_POW_QQW(int obits, int, int rbits, QData lhs, WDataInP const rwp); + +#define VL_POWSS_IIQ(obits, lbits, rbits, lhs, rhs, lsign, rsign) \ + VL_POWSS_QQQ(obits, lbits, rbits, lhs, rhs, lsign, rsign) +#define VL_POWSS_IIQ(obits, lbits, rbits, lhs, rhs, lsign, rsign) \ + VL_POWSS_QQQ(obits, lbits, rbits, lhs, rhs, lsign, rsign) +#define VL_POWSS_IIW(obits, lbits, rbits, lhs, rwp, lsign, rsign) \ + VL_POWSS_QQW(obits, lbits, rbits, lhs, rwp, lsign, rsign) +#define VL_POWSS_QQI(obits, lbits, rbits, lhs, rhs, lsign, rsign) \ + VL_POWSS_QQQ(obits, lbits, rbits, lhs, rhs, lsign, rsign) +#define VL_POWSS_WWI(obits, lbits, rbits, owp, lwp, rhs, lsign, rsign) \ + VL_POWSS_WWQ(obits, lbits, rbits, owp, lwp, rhs, lsign, rsign) + +static inline IData VL_POWSS_III(int obits, int, int rbits, IData lhs, IData rhs, bool lsign, + bool rsign) VL_MT_SAFE { + if (VL_UNLIKELY(rhs == 0)) return 1; + if (rsign && VL_SIGN_I(rbits, rhs)) { + if (lhs == 0) { + return 0; // "X" + } else if (lhs == 1) { + return 1; + } else if (lsign && lhs == VL_MASK_I(obits)) { // -1 + if (rhs & 1) { + return VL_MASK_I(obits); // -1^odd=-1 + } else { + return 1; // -1^even=1 + } + } + return 0; + } + return VL_POW_III(obits, rbits, rbits, lhs, rhs); +} +static inline QData VL_POWSS_QQQ(int obits, int, int rbits, QData lhs, QData rhs, bool lsign, + bool rsign) VL_MT_SAFE { + if (VL_UNLIKELY(rhs == 0)) return 1; + if (rsign && VL_SIGN_Q(rbits, rhs)) { + if (lhs == 0) { + return 0; // "X" + } else if (lhs == 1) { + return 1; + } else if (lsign && lhs == VL_MASK_Q(obits)) { // -1 + if (rhs & 1) { + return VL_MASK_Q(obits); // -1^odd=-1 + } else { + return 1; // -1^even=1 + } + } + return 0; + } + return VL_POW_QQQ(obits, rbits, rbits, lhs, rhs); +} +WDataOutP VL_POWSS_WWW(int obits, int, int rbits, WDataOutP owp, WDataInP const lwp, + WDataInP const rwp, bool lsign, bool rsign); +WDataOutP VL_POWSS_WWQ(int obits, int, int rbits, WDataOutP owp, WDataInP const lwp, QData rhs, + bool lsign, bool rsign); +QData VL_POWSS_QQW(int obits, int, int rbits, QData lhs, WDataInP const rwp, bool lsign, + bool rsign); + +//=================================================================== +// Concat/replication + +// INTERNAL: Stuff LHS bit 0++ into OUTPUT at specified offset +// ld may be "dirty", output is clean +static inline void _vl_insert_II(int, CData& lhsr, IData ld, int hbit, int lbit, + int rbits) VL_PURE { + const IData cleanmask = VL_MASK_I(rbits); + const IData insmask = (VL_MASK_I(hbit - lbit + 1)) << lbit; + lhsr 
= (lhsr & ~insmask) | ((ld << lbit) & (insmask & cleanmask)); +} +static inline void _vl_insert_II(int, SData& lhsr, IData ld, int hbit, int lbit, + int rbits) VL_PURE { + const IData cleanmask = VL_MASK_I(rbits); + const IData insmask = (VL_MASK_I(hbit - lbit + 1)) << lbit; + lhsr = (lhsr & ~insmask) | ((ld << lbit) & (insmask & cleanmask)); +} +static inline void _vl_insert_II(int, IData& lhsr, IData ld, int hbit, int lbit, + int rbits) VL_PURE { + const IData cleanmask = VL_MASK_I(rbits); + const IData insmask = (VL_MASK_I(hbit - lbit + 1)) << lbit; + lhsr = (lhsr & ~insmask) | ((ld << lbit) & (insmask & cleanmask)); +} +static inline void _vl_insert_QQ(int, QData& lhsr, QData ld, int hbit, int lbit, + int rbits) VL_PURE { + const QData cleanmask = VL_MASK_Q(rbits); + const QData insmask = (VL_MASK_Q(hbit - lbit + 1)) << lbit; + lhsr = (lhsr & ~insmask) | ((ld << lbit) & (insmask & cleanmask)); +} +static inline void _vl_insert_WI(int, WDataOutP owp, IData ld, int hbit, int lbit, + int rbits = 0) VL_MT_SAFE { + const int hoffset = VL_BITBIT_E(hbit); + const int loffset = VL_BITBIT_E(lbit); + const int roffset = VL_BITBIT_E(rbits); + const int hword = VL_BITWORD_E(hbit); + const int lword = VL_BITWORD_E(lbit); + const int rword = VL_BITWORD_E(rbits); + const EData cleanmask = hword == rword ? VL_MASK_E(roffset) : VL_MASK_E(0); + + if (hoffset == VL_SIZEBITS_E && loffset == 0) { + // Fast and common case, word based insertion + owp[VL_BITWORD_E(lbit)] = ld & cleanmask; + } else { + const EData lde = static_cast(ld); + if (hword == lword) { // know < EData bits because above checks it + // Assignment is contained within one word of destination + const EData insmask = (VL_MASK_E(hoffset - loffset + 1)) << loffset; + owp[lword] = (owp[lword] & ~insmask) | ((lde << loffset) & (insmask & cleanmask)); + } else { + // Assignment crosses a word boundary in destination + const EData hinsmask = (VL_MASK_E(hoffset - 0 + 1)) << 0; + const EData linsmask = (VL_MASK_E((VL_EDATASIZE - 1) - loffset + 1)) << loffset; + const int nbitsonright = VL_EDATASIZE - loffset; // bits that end up in lword + owp[lword] = (owp[lword] & ~linsmask) | ((lde << loffset) & linsmask); + owp[hword] + = (owp[hword] & ~hinsmask) | ((lde >> nbitsonright) & (hinsmask & cleanmask)); + } + } +} + +// INTERNAL: Stuff large LHS bit 0++ into OUTPUT at specified offset +// lwp may be "dirty" +static inline void _vl_insert_WW(int, WDataOutP owp, WDataInP const lwp, int hbit, int lbit, + int rbits = 0) VL_MT_SAFE { + const int hoffset = VL_BITBIT_E(hbit); + const int loffset = VL_BITBIT_E(lbit); + const int roffset = VL_BITBIT_E(rbits); + const int lword = VL_BITWORD_E(lbit); + const int hword = VL_BITWORD_E(hbit); + const int rword = VL_BITWORD_E(rbits); + const int words = VL_WORDS_I(hbit - lbit + 1); + // Cleaning mask, only applied to top word of the assignment. Is a no-op + // if we don't assign to the top word of the destination. + const EData cleanmask = hword == rword ? 
VL_MASK_E(roffset) : VL_MASK_E(0); + + if (hoffset == VL_SIZEBITS_E && loffset == 0) { + // Fast and common case, word based insertion + for (int i = 0; i < (words - 1); ++i) owp[lword + i] = lwp[i]; + owp[hword] = lwp[words - 1] & cleanmask; + } else if (loffset == 0) { + // Non-32bit, but nicely aligned, so stuff all but the last word + for (int i = 0; i < (words - 1); ++i) owp[lword + i] = lwp[i]; + // Know it's not a full word as above fast case handled it + const EData hinsmask = (VL_MASK_E(hoffset - 0 + 1)); + owp[hword] = (owp[hword] & ~hinsmask) | (lwp[words - 1] & (hinsmask & cleanmask)); + } else { + const EData hinsmask = (VL_MASK_E(hoffset - 0 + 1)) << 0; + const EData linsmask = (VL_MASK_E((VL_EDATASIZE - 1) - loffset + 1)) << loffset; + const int nbitsonright + = VL_EDATASIZE - loffset; // bits that end up in lword (know loffset!=0) + // Middle words + for (int i = 0; i < words; ++i) { + { // Lower word + const int oword = lword + i; + const EData d = lwp[i] << loffset; + const EData od = (owp[oword] & ~linsmask) | (d & linsmask); + if (oword == hword) { + owp[oword] = (owp[oword] & ~hinsmask) | (od & (hinsmask & cleanmask)); + } else { + owp[oword] = od; + } + } + { // Upper word + const int oword = lword + i + 1; + if (oword <= hword) { + const EData d = lwp[i] >> nbitsonright; + const EData od = (d & ~linsmask) | (owp[oword] & linsmask); + if (oword == hword) { + owp[oword] = (owp[oword] & ~hinsmask) | (od & (hinsmask & cleanmask)); + } else { + owp[oword] = od; + } + } + } + } + } +} + +static inline void _vl_insert_WQ(int obits, WDataOutP owp, QData ld, int hbit, int lbit, + int rbits = 0) VL_MT_SAFE { + VlWide lwp; + VL_SET_WQ(lwp, ld); + _vl_insert_WW(obits, owp, lwp, hbit, lbit, rbits); +} + +// EMIT_RULE: VL_REPLICATE: oclean=clean>width32, dirty<=width32; lclean=clean; rclean==clean; +// RHS MUST BE CLEAN CONSTANT. +#define VL_REPLICATE_IOI(obits, lbits, rbits, ld, rep) (-(ld)) // Iff lbits==1 +#define VL_REPLICATE_QOI(obits, lbits, rbits, ld, rep) (-(static_cast(ld))) // Iff lbits==1 + +static inline IData VL_REPLICATE_III(int, int lbits, int, IData ld, IData rep) VL_PURE { + IData returndata = ld; + for (unsigned i = 1; i < rep; ++i) { + returndata = returndata << lbits; + returndata |= ld; + } + return returndata; +} +static inline QData VL_REPLICATE_QII(int, int lbits, int, IData ld, IData rep) VL_PURE { + QData returndata = ld; + for (unsigned i = 1; i < rep; ++i) { + returndata = returndata << lbits; + returndata |= static_cast(ld); + } + return returndata; +} +static inline WDataOutP VL_REPLICATE_WII(int obits, int lbits, int, WDataOutP owp, IData ld, + IData rep) VL_MT_SAFE { + owp[0] = ld; + for (unsigned i = 1; i < rep; ++i) { + _vl_insert_WI(obits, owp, ld, i * lbits + lbits - 1, i * lbits); + } + return owp; +} +static inline WDataOutP VL_REPLICATE_WQI(int obits, int lbits, int, WDataOutP owp, QData ld, + IData rep) VL_MT_SAFE { + VL_SET_WQ(owp, ld); + for (unsigned i = 1; i < rep; ++i) { + _vl_insert_WQ(obits, owp, ld, i * lbits + lbits - 1, i * lbits); + } + return owp; +} +static inline WDataOutP VL_REPLICATE_WWI(int obits, int lbits, int, WDataOutP owp, + WDataInP const lwp, IData rep) VL_MT_SAFE { + for (int i = 0; i < VL_WORDS_I(lbits); ++i) owp[i] = lwp[i]; + for (unsigned i = 1; i < rep; ++i) { + _vl_insert_WW(obits, owp, lwp, i * lbits + lbits - 1, i * lbits); + } + return owp; +} + +// Left stream operator. Output will always be clean. LHS and RHS must be clean. +// Special "fast" versions for slice sizes that are a power of 2. 
These use +// shifts and masks to execute faster than the slower for-loop approach where a +// subset of bits is copied in during each iteration. +static inline IData VL_STREAML_FAST_III(int, int lbits, int, IData ld, IData rd_log2) VL_PURE { + // Pre-shift bits in most-significant slice: + // + // If lbits is not a multiple of the slice size (i.e., lbits % rd != 0), + // then we end up with a "gap" in our reversed result. For example, if we + // have a 5-bit Verlilog signal (lbits=5) in an 8-bit C data type: + // + // ld = ---43210 + // + // (where numbers are the Verilog signal bit numbers and '-' is an unused bit). + // Executing the switch statement below with a slice size of two (rd=2, + // rd_log2=1) produces: + // + // ret = 1032-400 + // + // Pre-shifting the bits in the most-significant slice allows us to avoid + // this gap in the shuffled data: + // + // ld_adjusted = --4-3210 + // ret = 10324--- + IData ret = ld; + if (rd_log2) { + const vluint32_t lbitsFloor = lbits & ~VL_MASK_I(rd_log2); // max multiple of rd <= lbits + const vluint32_t lbitsRem = lbits - lbitsFloor; // number of bits in most-sig slice (MSS) + const IData msbMask = VL_MASK_I(lbitsRem) << lbitsFloor; // mask to sel only bits in MSS + ret = (ret & ~msbMask) | ((ret & msbMask) << ((VL_UL(1) << rd_log2) - lbitsRem)); + } + switch (rd_log2) { + case 0: ret = ((ret >> 1) & VL_UL(0x55555555)) | ((ret & VL_UL(0x55555555)) << 1); // FALLTHRU + case 1: ret = ((ret >> 2) & VL_UL(0x33333333)) | ((ret & VL_UL(0x33333333)) << 2); // FALLTHRU + case 2: ret = ((ret >> 4) & VL_UL(0x0f0f0f0f)) | ((ret & VL_UL(0x0f0f0f0f)) << 4); // FALLTHRU + case 3: ret = ((ret >> 8) & VL_UL(0x00ff00ff)) | ((ret & VL_UL(0x00ff00ff)) << 8); // FALLTHRU + case 4: ret = ((ret >> 16) | (ret << 16)); // FALLTHRU + default:; + } + return ret >> (VL_IDATASIZE - lbits); +} + +static inline QData VL_STREAML_FAST_QQI(int, int lbits, int, QData ld, IData rd_log2) VL_PURE { + // Pre-shift bits in most-significant slice (see comment in VL_STREAML_FAST_III) + QData ret = ld; + if (rd_log2) { + const vluint32_t lbitsFloor = lbits & ~VL_MASK_I(rd_log2); + const vluint32_t lbitsRem = lbits - lbitsFloor; + const QData msbMask = VL_MASK_Q(lbitsRem) << lbitsFloor; + ret = (ret & ~msbMask) | ((ret & msbMask) << ((1ULL << rd_log2) - lbitsRem)); + } + switch (rd_log2) { + case 0: + ret = (((ret >> 1) & 0x5555555555555555ULL) + | ((ret & 0x5555555555555555ULL) << 1)); // FALLTHRU + case 1: + ret = (((ret >> 2) & 0x3333333333333333ULL) + | ((ret & 0x3333333333333333ULL) << 2)); // FALLTHRU + case 2: + ret = (((ret >> 4) & 0x0f0f0f0f0f0f0f0fULL) + | ((ret & 0x0f0f0f0f0f0f0f0fULL) << 4)); // FALLTHRU + case 3: + ret = (((ret >> 8) & 0x00ff00ff00ff00ffULL) + | ((ret & 0x00ff00ff00ff00ffULL) << 8)); // FALLTHRU + case 4: + ret = (((ret >> 16) & 0x0000ffff0000ffffULL) + | ((ret & 0x0000ffff0000ffffULL) << 16)); // FALLTHRU + case 5: ret = ((ret >> 32) | (ret << 32)); // FALLTHRU + default:; + } + return ret >> (VL_QUADSIZE - lbits); +} + +// Regular "slow" streaming operators +static inline IData VL_STREAML_III(int, int lbits, int, IData ld, IData rd) VL_PURE { + IData ret = 0; + // Slice size should never exceed the lhs width + const IData mask = VL_MASK_I(rd); + for (int istart = 0; istart < lbits; istart += rd) { + int ostart = lbits - rd - istart; + ostart = ostart > 0 ? 
ostart : 0; + ret |= ((ld >> istart) & mask) << ostart; + } + return ret; +} + +static inline QData VL_STREAML_QQI(int, int lbits, int, QData ld, IData rd) VL_PURE { + QData ret = 0; + // Slice size should never exceed the lhs width + const QData mask = VL_MASK_Q(rd); + for (int istart = 0; istart < lbits; istart += rd) { + int ostart = lbits - rd - istart; + ostart = ostart > 0 ? ostart : 0; + ret |= ((ld >> istart) & mask) << ostart; + } + return ret; +} + +static inline WDataOutP VL_STREAML_WWI(int, int lbits, int, WDataOutP owp, WDataInP const lwp, + IData rd) VL_MT_SAFE { + VL_ZERO_W(lbits, owp); + // Slice size should never exceed the lhs width + const int ssize = (rd < static_cast(lbits)) ? rd : (static_cast(lbits)); + for (int istart = 0; istart < lbits; istart += rd) { + int ostart = lbits - rd - istart; + ostart = ostart > 0 ? ostart : 0; + for (int sbit = 0; sbit < ssize && sbit < lbits - istart; ++sbit) { + // Extract a single bit from lwp and shift it to the correct + // location for owp. + EData bit = (VL_BITRSHIFT_W(lwp, (istart + sbit)) & 1) << VL_BITBIT_E(ostart + sbit); + owp[VL_BITWORD_E(ostart + sbit)] |= bit; + } + } + return owp; +} + +// Because concats are common and wide, it's valuable to always have a clean output. +// Thus we specify inputs must be clean, so we don't need to clean the output. +// Note the bit shifts are always constants, so the adds in these constify out. +// Casts required, as args may be 8 bit entities, and need to shift to appropriate output size +#define VL_CONCAT_III(obits, lbits, rbits, ld, rd) \ + (static_cast(ld) << (rbits) | static_cast(rd)) +#define VL_CONCAT_QII(obits, lbits, rbits, ld, rd) \ + (static_cast(ld) << (rbits) | static_cast(rd)) +#define VL_CONCAT_QIQ(obits, lbits, rbits, ld, rd) \ + (static_cast(ld) << (rbits) | static_cast(rd)) +#define VL_CONCAT_QQI(obits, lbits, rbits, ld, rd) \ + (static_cast(ld) << (rbits) | static_cast(rd)) +#define VL_CONCAT_QQQ(obits, lbits, rbits, ld, rd) \ + (static_cast(ld) << (rbits) | static_cast(rd)) + +static inline WDataOutP VL_CONCAT_WII(int obits, int lbits, int rbits, WDataOutP owp, IData ld, + IData rd) VL_MT_SAFE { + owp[0] = rd; + for (int i = 1; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + _vl_insert_WI(obits, owp, ld, rbits + lbits - 1, rbits); + return owp; +} +static inline WDataOutP VL_CONCAT_WWI(int obits, int lbits, int rbits, WDataOutP owp, + WDataInP const lwp, IData rd) VL_MT_SAFE { + owp[0] = rd; + for (int i = 1; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + _vl_insert_WW(obits, owp, lwp, rbits + lbits - 1, rbits); + return owp; +} +static inline WDataOutP VL_CONCAT_WIW(int obits, int lbits, int rbits, WDataOutP owp, IData ld, + WDataInP const rwp) VL_MT_SAFE { + for (int i = 0; i < VL_WORDS_I(rbits); ++i) owp[i] = rwp[i]; + for (int i = VL_WORDS_I(rbits); i < VL_WORDS_I(obits); ++i) owp[i] = 0; + _vl_insert_WI(obits, owp, ld, rbits + lbits - 1, rbits); + return owp; +} +static inline WDataOutP VL_CONCAT_WIQ(int obits, int lbits, int rbits, WDataOutP owp, IData ld, + QData rd) VL_MT_SAFE { + VL_SET_WQ(owp, rd); + for (int i = VL_WQ_WORDS_E; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + _vl_insert_WI(obits, owp, ld, rbits + lbits - 1, rbits); + return owp; +} +static inline WDataOutP VL_CONCAT_WQI(int obits, int lbits, int rbits, WDataOutP owp, QData ld, + IData rd) VL_MT_SAFE { + owp[0] = rd; + for (int i = 1; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + _vl_insert_WQ(obits, owp, ld, rbits + lbits - 1, rbits); + return owp; +} +static inline WDataOutP VL_CONCAT_WQQ(int obits, int lbits, 
int rbits, WDataOutP owp, QData ld, + QData rd) VL_MT_SAFE { + VL_SET_WQ(owp, rd); + for (int i = VL_WQ_WORDS_E; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + _vl_insert_WQ(obits, owp, ld, rbits + lbits - 1, rbits); + return owp; +} +static inline WDataOutP VL_CONCAT_WWQ(int obits, int lbits, int rbits, WDataOutP owp, + WDataInP const lwp, QData rd) VL_MT_SAFE { + VL_SET_WQ(owp, rd); + for (int i = VL_WQ_WORDS_E; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + _vl_insert_WW(obits, owp, lwp, rbits + lbits - 1, rbits); + return owp; +} +static inline WDataOutP VL_CONCAT_WQW(int obits, int lbits, int rbits, WDataOutP owp, QData ld, + WDataInP const rwp) VL_MT_SAFE { + for (int i = 0; i < VL_WORDS_I(rbits); ++i) owp[i] = rwp[i]; + for (int i = VL_WORDS_I(rbits); i < VL_WORDS_I(obits); ++i) owp[i] = 0; + _vl_insert_WQ(obits, owp, ld, rbits + lbits - 1, rbits); + return owp; +} +static inline WDataOutP VL_CONCAT_WWW(int obits, int lbits, int rbits, WDataOutP owp, + WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { + for (int i = 0; i < VL_WORDS_I(rbits); ++i) owp[i] = rwp[i]; + for (int i = VL_WORDS_I(rbits); i < VL_WORDS_I(obits); ++i) owp[i] = 0; + _vl_insert_WW(obits, owp, lwp, rbits + lbits - 1, rbits); + return owp; +} + +//=================================================================== +// Shifts + +// Static shift, used by internal functions +// The output is the same as the input - it overlaps! +static inline void _vl_shiftl_inplace_w(int obits, WDataOutP iowp, + IData rd /*1 or 4*/) VL_MT_SAFE { + const int words = VL_WORDS_I(obits); + const EData linsmask = VL_MASK_E(rd); + for (int i = words - 1; i >= 1; --i) { + iowp[i] + = ((iowp[i] << rd) & ~linsmask) | ((iowp[i - 1] >> (VL_EDATASIZE - rd)) & linsmask); + } + iowp[0] = ((iowp[0] << rd) & ~linsmask); + iowp[VL_WORDS_I(obits) - 1] &= VL_MASK_E(obits); +} + +// EMIT_RULE: VL_SHIFTL: oclean=lclean; rclean==clean; +// Important: Unlike most other funcs, the shift might well be a computed +// expression. Thus consider this when optimizing. (And perhaps have 2 funcs?) 
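To make the note above concrete, here is a hedged sketch of how an oversized, runtime-computed shift amount behaves with the helpers defined below (the 8-bit width and the names are hypothetical; plain C++ << is undefined at or past the operand width, so the helpers guard and clean instead):

// Sketch only: an 8-bit left shift by a 64-bit computed amount.
static IData shiftl_demo(IData lhs8, QData amount) {
    // VL_SHIFTL_IIQ returns 0 outright once amount >= 32; over-shifts in the
    // 8..31 range are masked away by its final VL_CLEAN_II to obits = 8.
    return VL_SHIFTL_IIQ(8, 8, 64, lhs8 & VL_MASK_I(8), amount);
}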
+static inline WDataOutP VL_SHIFTL_WWI(int obits, int, int, WDataOutP owp, WDataInP const lwp, + IData rd) VL_MT_SAFE { + const int word_shift = VL_BITWORD_E(rd); + const int bit_shift = VL_BITBIT_E(rd); + if (rd >= static_cast(obits)) { // rd may be huge with MSB set + for (int i = 0; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + } else if (bit_shift == 0) { // Aligned word shift (<<0,<<32,<<64 etc) + for (int i = 0; i < word_shift; ++i) owp[i] = 0; + for (int i = word_shift; i < VL_WORDS_I(obits); ++i) owp[i] = lwp[i - word_shift]; + } else { + for (int i = 0; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + _vl_insert_WW(obits, owp, lwp, obits - 1, rd); + } + return owp; +} +static inline WDataOutP VL_SHIFTL_WWW(int obits, int lbits, int rbits, WDataOutP owp, + WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { + for (int i = 1; i < VL_WORDS_I(rbits); ++i) { + if (VL_UNLIKELY(rwp[i])) { // Huge shift 1>>32 or more + return VL_ZERO_W(obits, owp); + } + } + return VL_SHIFTL_WWI(obits, lbits, 32, owp, lwp, rwp[0]); +} +static inline WDataOutP VL_SHIFTL_WWQ(int obits, int lbits, int rbits, WDataOutP owp, + WDataInP const lwp, QData rd) VL_MT_SAFE { + VlWide rwp; + VL_SET_WQ(rwp, rd); + return VL_SHIFTL_WWW(obits, lbits, rbits, owp, lwp, rwp); +} +static inline IData VL_SHIFTL_IIW(int obits, int, int rbits, IData lhs, + WDataInP const rwp) VL_MT_SAFE { + for (int i = 1; i < VL_WORDS_I(rbits); ++i) { + if (VL_UNLIKELY(rwp[i])) { // Huge shift 1>>32 or more + return 0; + } + } + return VL_CLEAN_II(obits, obits, lhs << rwp[0]); +} +static inline IData VL_SHIFTL_IIQ(int obits, int, int, IData lhs, QData rhs) VL_MT_SAFE { + if (VL_UNLIKELY(rhs >= VL_IDATASIZE)) return 0; + return VL_CLEAN_II(obits, obits, lhs << rhs); +} +static inline QData VL_SHIFTL_QQW(int obits, int, int rbits, QData lhs, + WDataInP const rwp) VL_MT_SAFE { + for (int i = 1; i < VL_WORDS_I(rbits); ++i) { + if (VL_UNLIKELY(rwp[i])) { // Huge shift 1>>32 or more + return 0; + } + } + // Above checks rwp[1]==0 so not needed in below shift + return VL_CLEAN_QQ(obits, obits, lhs << (static_cast(rwp[0]))); +} +static inline QData VL_SHIFTL_QQQ(int obits, int, int, QData lhs, QData rhs) VL_MT_SAFE { + if (VL_UNLIKELY(rhs >= VL_QUADSIZE)) return 0; + return VL_CLEAN_QQ(obits, obits, lhs << rhs); +} + +// EMIT_RULE: VL_SHIFTR: oclean=lclean; rclean==clean; +// Important: Unlike most other funcs, the shift might well be a computed +// expression. Thus consider this when optimizing. (And perhaps have 2 funcs?) 
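Likewise for the right-shift helpers defined below, a hedged sketch with a hypothetical 128-bit signal: the shift amount is passed whole, and an amount at or past the width (including any set bit in the upper words of a wide amount) yields all zeros:

// Sketch only: out = a >> b for a 128-bit 'a' (VlWide<4>) and a 64-bit 'b'.
static void shiftr_demo(VlWide<4>& out, const VlWide<4>& a, QData b) {
    VL_SHIFTR_WWQ(128, 128, 64, out, a, b);  // zeroes 'out' whenever b >= 128
}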
+static inline WDataOutP VL_SHIFTR_WWI(int obits, int, int, WDataOutP owp, WDataInP const lwp, + IData rd) VL_MT_SAFE { + const int word_shift = VL_BITWORD_E(rd); // Maybe 0 + const int bit_shift = VL_BITBIT_E(rd); + if (rd >= static_cast(obits)) { // rd may be huge with MSB set + for (int i = 0; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + } else if (bit_shift == 0) { // Aligned word shift (>>0,>>32,>>64 etc) + const int copy_words = (VL_WORDS_I(obits) - word_shift); + for (int i = 0; i < copy_words; ++i) owp[i] = lwp[i + word_shift]; + for (int i = copy_words; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + } else { + const int loffset = rd & VL_SIZEBITS_E; + const int nbitsonright = VL_EDATASIZE - loffset; // bits that end up in lword (know + // loffset!=0) Middle words + const int words = VL_WORDS_I(obits - rd); + for (int i = 0; i < words; ++i) { + owp[i] = lwp[i + word_shift] >> loffset; + const int upperword = i + word_shift + 1; + if (upperword < VL_WORDS_I(obits)) owp[i] |= lwp[upperword] << nbitsonright; + } + for (int i = words; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + } + return owp; +} +static inline WDataOutP VL_SHIFTR_WWW(int obits, int lbits, int rbits, WDataOutP owp, + WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { + for (int i = 1; i < VL_WORDS_I(rbits); ++i) { + if (VL_UNLIKELY(rwp[i])) { // Huge shift 1>>32 or more + return VL_ZERO_W(obits, owp); + } + } + return VL_SHIFTR_WWI(obits, lbits, 32, owp, lwp, rwp[0]); +} +static inline WDataOutP VL_SHIFTR_WWQ(int obits, int lbits, int rbits, WDataOutP owp, + WDataInP const lwp, QData rd) VL_MT_SAFE { + VlWide rwp; + VL_SET_WQ(rwp, rd); + return VL_SHIFTR_WWW(obits, lbits, rbits, owp, lwp, rwp); +} + +static inline IData VL_SHIFTR_IIW(int obits, int, int rbits, IData lhs, + WDataInP const rwp) VL_MT_SAFE { + for (int i = 1; i < VL_WORDS_I(rbits); ++i) { + if (VL_UNLIKELY(rwp[i])) { // Huge shift 1>>32 or more + return 0; + } + } + return VL_CLEAN_II(obits, obits, lhs >> rwp[0]); +} +static inline QData VL_SHIFTR_QQW(int obits, int, int rbits, QData lhs, + WDataInP const rwp) VL_MT_SAFE { + for (int i = 1; i < VL_WORDS_I(rbits); ++i) { + if (VL_UNLIKELY(rwp[i])) { // Huge shift 1>>32 or more + return 0; + } + } + // Above checks rwp[1]==0 so not needed in below shift + return VL_CLEAN_QQ(obits, obits, lhs >> (static_cast(rwp[0]))); +} +static inline IData VL_SHIFTR_IIQ(int obits, int, int, IData lhs, QData rhs) VL_MT_SAFE { + if (VL_UNLIKELY(rhs >= VL_IDATASIZE)) return 0; + return VL_CLEAN_QQ(obits, obits, lhs >> rhs); +} +static inline QData VL_SHIFTR_QQQ(int obits, int, int, QData lhs, QData rhs) VL_MT_SAFE { + if (VL_UNLIKELY(rhs >= VL_QUADSIZE)) return 0; + return VL_CLEAN_QQ(obits, obits, lhs >> rhs); +} + +// EMIT_RULE: VL_SHIFTRS: oclean=false; lclean=clean, rclean==clean; +static inline IData VL_SHIFTRS_III(int obits, int lbits, int, IData lhs, IData rhs) VL_PURE { + // Note the C standard does not specify the >> operator as a arithmetic shift! + // IEEE says signed if output signed, but bit position from lbits; + // must use lbits for sign; lbits might != obits, + // an EXTEND(SHIFTRS(...)) can became a SHIFTRS(...) 
within same 32/64 bit word length + const IData sign = -(lhs >> (lbits - 1)); // ffff_ffff if negative + const IData signext = ~(VL_MASK_I(lbits) >> rhs); // One with bits where we've shifted "past" + return (lhs >> rhs) | (sign & VL_CLEAN_II(obits, obits, signext)); +} +static inline QData VL_SHIFTRS_QQI(int obits, int lbits, int, QData lhs, IData rhs) VL_PURE { + const QData sign = -(lhs >> (lbits - 1)); + const QData signext = ~(VL_MASK_Q(lbits) >> rhs); + return (lhs >> rhs) | (sign & VL_CLEAN_QQ(obits, obits, signext)); +} +static inline IData VL_SHIFTRS_IQI(int obits, int lbits, int rbits, QData lhs, IData rhs) VL_PURE { + return static_cast(VL_SHIFTRS_QQI(obits, lbits, rbits, lhs, rhs)); +} +static inline WDataOutP VL_SHIFTRS_WWI(int obits, int lbits, int, WDataOutP owp, + WDataInP const lwp, IData rd) VL_MT_SAFE { + const int word_shift = VL_BITWORD_E(rd); + const int bit_shift = VL_BITBIT_E(rd); + const int lmsw = VL_WORDS_I(obits) - 1; + const EData sign = VL_SIGNONES_E(lbits, lwp[lmsw]); + if (rd >= static_cast(obits)) { // Shifting past end, sign in all of lbits + for (int i = 0; i <= lmsw; ++i) owp[i] = sign; + owp[lmsw] &= VL_MASK_E(lbits); + } else if (bit_shift == 0) { // Aligned word shift (>>0,>>32,>>64 etc) + const int copy_words = (VL_WORDS_I(obits) - word_shift); + for (int i = 0; i < copy_words; ++i) owp[i] = lwp[i + word_shift]; + if (copy_words >= 0) owp[copy_words - 1] |= ~VL_MASK_E(obits) & sign; + for (int i = copy_words; i < VL_WORDS_I(obits); ++i) owp[i] = sign; + owp[lmsw] &= VL_MASK_E(lbits); + } else { + const int loffset = rd & VL_SIZEBITS_E; + int nbitsonright = VL_EDATASIZE - loffset; // bits that end up in lword (know loffset!=0) + // Middle words + const int words = VL_WORDS_I(obits - rd); + for (int i = 0; i < words; ++i) { + owp[i] = lwp[i + word_shift] >> loffset; + const int upperword = i + word_shift + 1; + if (upperword < VL_WORDS_I(obits)) owp[i] |= lwp[upperword] << nbitsonright; + } + if (words) owp[words - 1] |= sign & ~VL_MASK_E(obits - loffset); + for (int i = words; i < VL_WORDS_I(obits); ++i) owp[i] = sign; + owp[lmsw] &= VL_MASK_E(lbits); + } + return owp; +} +static inline WDataOutP VL_SHIFTRS_WWW(int obits, int lbits, int rbits, WDataOutP owp, + WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE { + EData overshift = 0; // Huge shift 1>>32 or more + for (int i = 1; i < VL_WORDS_I(rbits); ++i) overshift |= rwp[i]; + if (VL_UNLIKELY(overshift || rwp[0] >= obits)) { + const int lmsw = VL_WORDS_I(obits) - 1; + const EData sign = VL_SIGNONES_E(lbits, lwp[lmsw]); + for (int j = 0; j <= lmsw; ++j) owp[j] = sign; + owp[lmsw] &= VL_MASK_E(lbits); + return owp; + } + return VL_SHIFTRS_WWI(obits, lbits, 32, owp, lwp, rwp[0]); +} +static inline WDataOutP VL_SHIFTRS_WWQ(int obits, int lbits, int rbits, WDataOutP owp, + WDataInP const lwp, QData rd) VL_MT_SAFE { + VlWide rwp; + VL_SET_WQ(rwp, rd); + return VL_SHIFTRS_WWW(obits, lbits, rbits, owp, lwp, rwp); +} +static inline IData VL_SHIFTRS_IIW(int obits, int lbits, int rbits, IData lhs, + WDataInP const rwp) VL_MT_SAFE { + EData overshift = 0; // Huge shift 1>>32 or more + for (int i = 1; i < VL_WORDS_I(rbits); ++i) overshift |= rwp[i]; + if (VL_UNLIKELY(overshift || rwp[0] >= obits)) { + const IData sign = -(lhs >> (lbits - 1)); // ffff_ffff if negative + return VL_CLEAN_II(obits, obits, sign); + } + return VL_SHIFTRS_III(obits, lbits, 32, lhs, rwp[0]); +} +static inline QData VL_SHIFTRS_QQW(int obits, int lbits, int rbits, QData lhs, + WDataInP const rwp) VL_MT_SAFE { + EData overshift = 0; // 
Huge shift 1>>32 or more + for (int i = 1; i < VL_WORDS_I(rbits); ++i) overshift |= rwp[i]; + if (VL_UNLIKELY(overshift || rwp[0] >= obits)) { + const QData sign = -(lhs >> (lbits - 1)); // ffff_ffff if negative + return VL_CLEAN_QQ(obits, obits, sign); + } + return VL_SHIFTRS_QQI(obits, lbits, 32, lhs, rwp[0]); +} +static inline IData VL_SHIFTRS_IIQ(int obits, int lbits, int rbits, IData lhs, + QData rhs) VL_MT_SAFE { + VlWide rwp; + VL_SET_WQ(rwp, rhs); + return VL_SHIFTRS_IIW(obits, lbits, rbits, lhs, rwp); +} +static inline QData VL_SHIFTRS_QQQ(int obits, int lbits, int rbits, QData lhs, QData rhs) VL_PURE { + VlWide rwp; + VL_SET_WQ(rwp, rhs); + return VL_SHIFTRS_QQW(obits, lbits, rbits, lhs, rwp); +} + +//=================================================================== +// Bit selection + +// EMIT_RULE: VL_BITSEL: oclean=dirty; rclean==clean; +#define VL_BITSEL_IIII(obits, lbits, rbits, zbits, lhs, rhs) ((lhs) >> (rhs)) +#define VL_BITSEL_QIII(obits, lbits, rbits, zbits, lhs, rhs) ((lhs) >> (rhs)) +#define VL_BITSEL_QQII(obits, lbits, rbits, zbits, lhs, rhs) ((lhs) >> (rhs)) +#define VL_BITSEL_IQII(obits, lbits, rbits, zbits, lhs, rhs) (static_cast((lhs) >> (rhs))) + +static inline IData VL_BITSEL_IWII(int, int lbits, int, int, WDataInP const lwp, + IData rd) VL_MT_SAFE { + int word = VL_BITWORD_E(rd); + if (VL_UNLIKELY(rd > static_cast(lbits))) { + return ~0; // Spec says you can go outside the range of a array. Don't coredump if so. + // We return all 1's as that's more likely to find bugs (?) than 0's. + } else { + return (lwp[word] >> VL_BITBIT_E(rd)); + } +} + +// EMIT_RULE: VL_RANGE: oclean=lclean; out=dirty +// & MUST BE CLEAN (currently constant) +#define VL_SEL_IIII(obits, lbits, rbits, tbits, lhs, lsb, width) ((lhs) >> (lsb)) +#define VL_SEL_QQII(obits, lbits, rbits, tbits, lhs, lsb, width) ((lhs) >> (lsb)) +#define VL_SEL_IQII(obits, lbits, rbits, tbits, lhs, lsb, width) \ + (static_cast((lhs) >> (lsb))) + +static inline IData VL_SEL_IWII(int, int lbits, int, int, WDataInP const lwp, IData lsb, + IData width) VL_MT_SAFE { + int msb = lsb + width - 1; + if (VL_UNLIKELY(msb >= lbits)) { + return ~0; // Spec says you can go outside the range of a array. Don't coredump if so. + } else if (VL_BITWORD_E(msb) == VL_BITWORD_E(static_cast(lsb))) { + return VL_BITRSHIFT_W(lwp, lsb); + } else { + // 32 bit extraction may span two words + int nbitsfromlow = VL_EDATASIZE - VL_BITBIT_E(lsb); // bits that come from low word + return ((lwp[VL_BITWORD_E(msb)] << nbitsfromlow) | VL_BITRSHIFT_W(lwp, lsb)); + } +} + +static inline QData VL_SEL_QWII(int, int lbits, int, int, WDataInP const lwp, IData lsb, + IData width) VL_MT_SAFE { + const int msb = lsb + width - 1; + if (VL_UNLIKELY(msb > lbits)) { + return ~0; // Spec says you can go outside the range of a array. Don't coredump if so. 
+ } else if (VL_BITWORD_E(msb) == VL_BITWORD_E(static_cast(lsb))) { + return VL_BITRSHIFT_W(lwp, lsb); + } else if (VL_BITWORD_E(msb) == 1 + VL_BITWORD_E(static_cast(lsb))) { + const int nbitsfromlow = VL_EDATASIZE - VL_BITBIT_E(lsb); + const QData hi = (lwp[VL_BITWORD_E(msb)]); + const QData lo = VL_BITRSHIFT_W(lwp, lsb); + return (hi << nbitsfromlow) | lo; + } else { + // 64 bit extraction may span three words + int nbitsfromlow = VL_EDATASIZE - VL_BITBIT_E(lsb); + const QData hi = (lwp[VL_BITWORD_E(msb)]); + const QData mid = (lwp[VL_BITWORD_E(lsb) + 1]); + const QData lo = VL_BITRSHIFT_W(lwp, lsb); + return (hi << (nbitsfromlow + VL_EDATASIZE)) | (mid << nbitsfromlow) | lo; + } +} + +static inline WDataOutP VL_SEL_WWII(int obits, int lbits, int, int, WDataOutP owp, + WDataInP const lwp, IData lsb, IData width) VL_MT_SAFE { + const int msb = lsb + width - 1; + const int word_shift = VL_BITWORD_E(lsb); + if (VL_UNLIKELY(msb > lbits)) { // Outside bounds, + for (int i = 0; i < VL_WORDS_I(obits) - 1; ++i) owp[i] = ~0; + owp[VL_WORDS_I(obits) - 1] = VL_MASK_E(obits); + } else if (VL_BITBIT_E(lsb) == 0) { + // Just a word extract + for (int i = 0; i < VL_WORDS_I(obits); ++i) owp[i] = lwp[i + word_shift]; + } else { + // Not a _vl_insert because the bits come from any bit number and goto bit 0 + const int loffset = lsb & VL_SIZEBITS_E; + const int nbitsfromlow = VL_EDATASIZE - loffset; // bits that end up in lword (know + // loffset!=0) Middle words + const int words = VL_WORDS_I(msb - lsb + 1); + for (int i = 0; i < words; ++i) { + owp[i] = lwp[i + word_shift] >> loffset; + const int upperword = i + word_shift + 1; + if (upperword <= static_cast(VL_BITWORD_E(msb))) { + owp[i] |= lwp[upperword] << nbitsfromlow; + } + } + for (int i = words; i < VL_WORDS_I(obits); ++i) owp[i] = 0; + } + return owp; +} + +//====================================================================== +// Math needing insert/select + +// Return QData from double (numeric) +// EMIT_RULE: VL_RTOIROUND_Q_D: oclean=dirty; lclean==clean/real +static inline QData VL_RTOIROUND_Q_D(int, double lhs) VL_PURE { + // IEEE format: [63]=sign [62:52]=exp+1023 [51:0]=mantissa + // This does not need to support subnormals as they are sub-integral + lhs = VL_ROUND(lhs); + if (lhs == 0.0) return 0; + const QData q = VL_CVT_Q_D(lhs); + const int lsb = static_cast((q >> 52ULL) & VL_MASK_Q(11)) - 1023 - 52; + const vluint64_t mantissa = (q & VL_MASK_Q(52)) | (1ULL << 52); + vluint64_t out = 0; + if (lsb < 0) { + out = mantissa >> -lsb; + } else if (lsb < 64) { + out = mantissa << lsb; + } + if (lhs < 0) out = -out; + return out; +} +static inline IData VL_RTOIROUND_I_D(int bits, double lhs) VL_PURE { + return static_cast(VL_RTOIROUND_Q_D(bits, lhs)); +} +static inline WDataOutP VL_RTOIROUND_W_D(int obits, WDataOutP owp, double lhs) VL_PURE { + // IEEE format: [63]=sign [62:52]=exp+1023 [51:0]=mantissa + // This does not need to support subnormals as they are sub-integral + lhs = VL_ROUND(lhs); + VL_ZERO_W(obits, owp); + if (lhs == 0.0) return owp; + const QData q = VL_CVT_Q_D(lhs); + const int lsb = static_cast((q >> 52ULL) & VL_MASK_Q(11)) - 1023 - 52; + const vluint64_t mantissa = (q & VL_MASK_Q(52)) | (1ULL << 52); + if (lsb < 0) { + VL_SET_WQ(owp, mantissa >> -lsb); + } else if (lsb < obits) { + _vl_insert_WQ(obits, owp, mantissa, lsb + 52, lsb); + } + if (lhs < 0) VL_NEGATE_INPLACE_W(VL_WORDS_I(obits), owp); + return owp; +} + +//====================================================================== +// Range assignments + +// 
EMIT_RULE: VL_ASSIGNRANGE: rclean=dirty; +static inline void VL_ASSIGNSEL_IIII(int rbits, int obits, int lsb, CData& lhsr, + IData rhs) VL_PURE { + _vl_insert_II(obits, lhsr, rhs, lsb + obits - 1, lsb, rbits); +} +static inline void VL_ASSIGNSEL_IIII(int rbits, int obits, int lsb, SData& lhsr, + IData rhs) VL_PURE { + _vl_insert_II(obits, lhsr, rhs, lsb + obits - 1, lsb, rbits); +} +static inline void VL_ASSIGNSEL_IIII(int rbits, int obits, int lsb, IData& lhsr, + IData rhs) VL_PURE { + _vl_insert_II(obits, lhsr, rhs, lsb + obits - 1, lsb, rbits); +} +static inline void VL_ASSIGNSEL_QIII(int rbits, int obits, int lsb, QData& lhsr, + IData rhs) VL_PURE { + _vl_insert_QQ(obits, lhsr, rhs, lsb + obits - 1, lsb, rbits); +} +static inline void VL_ASSIGNSEL_QQII(int rbits, int obits, int lsb, QData& lhsr, + QData rhs) VL_PURE { + _vl_insert_QQ(obits, lhsr, rhs, lsb + obits - 1, lsb, rbits); +} +static inline void VL_ASSIGNSEL_QIIQ(int rbits, int obits, int lsb, QData& lhsr, + QData rhs) VL_PURE { + _vl_insert_QQ(obits, lhsr, rhs, lsb + obits - 1, lsb, rbits); +} +// static inline void VL_ASSIGNSEL_IIIW(int obits, int lsb, IData& lhsr, WDataInP const rwp) +// VL_MT_SAFE { Illegal, as lhs width >= rhs width +static inline void VL_ASSIGNSEL_WIII(int rbits, int obits, int lsb, WDataOutP owp, + IData rhs) VL_MT_SAFE { + _vl_insert_WI(obits, owp, rhs, lsb + obits - 1, lsb, rbits); +} +static inline void VL_ASSIGNSEL_WIIQ(int rbits, int obits, int lsb, WDataOutP owp, + QData rhs) VL_MT_SAFE { + _vl_insert_WQ(obits, owp, rhs, lsb + obits - 1, lsb, rbits); +} +static inline void VL_ASSIGNSEL_WIIW(int rbits, int obits, int lsb, WDataOutP owp, + WDataInP const rwp) VL_MT_SAFE { + _vl_insert_WW(obits, owp, rwp, lsb + obits - 1, lsb, rbits); +} + +//====================================================================== +// Triops + +static inline WDataOutP VL_COND_WIWW(int obits, int, int, int, WDataOutP owp, int cond, + WDataInP const w1p, WDataInP const w2p) VL_MT_SAFE { + const int words = VL_WORDS_I(obits); + for (int i = 0; i < words; ++i) owp[i] = cond ? w1p[i] : w2p[i]; + return owp; +} + +//====================================================================== +// Constification + +// VL_CONST_W_#X(int obits, WDataOutP owp, IData data0, .... IData data(#-1)) +// Sets wide vector words to specified constant words. +// These macros are used when o might represent more words then are given as constants, +// hence all upper words must be zeroed. 
+// If changing the number of functions here, also change EMITCINLINES_NUM_CONSTW + +#define VL_C_END_(obits, wordsSet) \ + for (int i = (wordsSet); i < VL_WORDS_I(obits); ++i) o[i] = 0; \ + return o + +// clang-format off +static inline WDataOutP VL_CONST_W_1X(int obits, WDataOutP o, EData d0) VL_MT_SAFE { + o[0] = d0; + VL_C_END_(obits, 1); +} +static inline WDataOutP VL_CONST_W_2X(int obits, WDataOutP o, EData d1, EData d0) VL_MT_SAFE { + o[0] = d0; o[1] = d1; + VL_C_END_(obits, 2); +} +static inline WDataOutP VL_CONST_W_3X(int obits, WDataOutP o, EData d2, EData d1, + EData d0) VL_MT_SAFE { + o[0] = d0; o[1] = d1; o[2] = d2; + VL_C_END_(obits,3); +} +static inline WDataOutP VL_CONST_W_4X(int obits, WDataOutP o, + EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { + o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; + VL_C_END_(obits,4); +} +static inline WDataOutP VL_CONST_W_5X(int obits, WDataOutP o, + EData d4, + EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { + o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; + o[4] = d4; + VL_C_END_(obits,5); +} +static inline WDataOutP VL_CONST_W_6X(int obits, WDataOutP o, + EData d5, EData d4, + EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { + o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; + o[4] = d4; o[5] = d5; + VL_C_END_(obits,6); +} +static inline WDataOutP VL_CONST_W_7X(int obits, WDataOutP o, + EData d6, EData d5, EData d4, + EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { + o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; + o[4] = d4; o[5] = d5; o[6] = d6; + VL_C_END_(obits,7); +} +static inline WDataOutP VL_CONST_W_8X(int obits, WDataOutP o, + EData d7, EData d6, EData d5, EData d4, + EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { + o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; + o[4] = d4; o[5] = d5; o[6] = d6; o[7] = d7; + VL_C_END_(obits,8); +} +// +static inline WDataOutP VL_CONSTHI_W_1X(int obits, int lsb, WDataOutP obase, + EData d0) VL_MT_SAFE { + WDataOutP o = obase + VL_WORDS_I(lsb); + o[0] = d0; + VL_C_END_(obits, VL_WORDS_I(lsb) + 1); +} +static inline WDataOutP VL_CONSTHI_W_2X(int obits, int lsb, WDataOutP obase, + EData d1, EData d0) VL_MT_SAFE { + WDataOutP o = obase + VL_WORDS_I(lsb); + o[0] = d0; o[1] = d1; + VL_C_END_(obits, VL_WORDS_I(lsb) + 2); +} +static inline WDataOutP VL_CONSTHI_W_3X(int obits, int lsb, WDataOutP obase, + EData d2, EData d1, EData d0) VL_MT_SAFE { + WDataOutP o = obase + VL_WORDS_I(lsb); + o[0] = d0; o[1] = d1; o[2] = d2; + VL_C_END_(obits, VL_WORDS_I(lsb) + 3); +} +static inline WDataOutP VL_CONSTHI_W_4X(int obits, int lsb, WDataOutP obase, + EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { + WDataOutP o = obase + VL_WORDS_I(lsb); + o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; + VL_C_END_(obits, VL_WORDS_I(lsb) + 4); +} +static inline WDataOutP VL_CONSTHI_W_5X(int obits, int lsb, WDataOutP obase, + EData d4, + EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { + WDataOutP o = obase + VL_WORDS_I(lsb); + o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; + o[4] = d4; + VL_C_END_(obits, VL_WORDS_I(lsb) + 5); +} +static inline WDataOutP VL_CONSTHI_W_6X(int obits, int lsb, WDataOutP obase, + EData d5, EData d4, + EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { + WDataOutP o = obase + VL_WORDS_I(lsb); + o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; + o[4] = d4; o[5] = d5; + VL_C_END_(obits, VL_WORDS_I(lsb) + 6); +} +static inline WDataOutP VL_CONSTHI_W_7X(int obits, int lsb, WDataOutP obase, + EData d6, EData d5, EData d4, + EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { + WDataOutP o = obase + 
VL_WORDS_I(lsb); + o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; + o[4] = d4; o[5] = d5; o[6] = d6; + VL_C_END_(obits, VL_WORDS_I(lsb) + 7); +} +static inline WDataOutP VL_CONSTHI_W_8X(int obits, int lsb, WDataOutP obase, + EData d7, EData d6, EData d5, EData d4, + EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { + WDataOutP o = obase + VL_WORDS_I(lsb); + o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; + o[4] = d4; o[5] = d5; o[6] = d6; o[7] = d7; + VL_C_END_(obits, VL_WORDS_I(lsb) + 8); +} + +#undef VL_C_END_ + +// Partial constant, lower words of vector wider than 8*32, starting at bit number lsb +static inline void VL_CONSTLO_W_8X(int lsb, WDataOutP obase, + EData d7, EData d6, EData d5, EData d4, + EData d3, EData d2, EData d1, EData d0) VL_MT_SAFE { + WDataOutP o = obase + VL_WORDS_I(lsb); + o[0] = d0; o[1] = d1; o[2] = d2; o[3] = d3; o[4] = d4; o[5] = d5; o[6] = d6; o[7] = d7; +} +// clang-format on + +//====================================================================== +// Strings + +extern std::string VL_PUTC_N(const std::string& lhs, IData rhs, CData ths) VL_PURE; +extern CData VL_GETC_N(const std::string& lhs, IData rhs) VL_PURE; +extern std::string VL_SUBSTR_N(const std::string& lhs, IData rhs, IData ths) VL_PURE; + +inline IData VL_CMP_NN(const std::string& lhs, const std::string& rhs, bool ignoreCase) VL_PURE { + // SystemVerilog does not allow a string variable to contain '\0'. + // So C functions such as strcmp() can correctly compare strings. + if (ignoreCase) { + return VL_STRCASECMP(lhs.c_str(), rhs.c_str()); + } else { + return std::strcmp(lhs.c_str(), rhs.c_str()); + } +} + +extern IData VL_ATOI_N(const std::string& str, int base) VL_PURE; + +extern IData VL_FGETS_NI(std::string& dest, IData fpi); + +//====================================================================== +// Conversion functions + +extern std::string VL_CVT_PACK_STR_NW(int lwords, const WDataInP lwp) VL_MT_SAFE; +inline std::string VL_CVT_PACK_STR_NQ(QData lhs) VL_PURE { + VlWide lw; + VL_SET_WQ(lw, lhs); + return VL_CVT_PACK_STR_NW(VL_WQ_WORDS_E, lw); +} +inline std::string VL_CVT_PACK_STR_NN(const std::string& lhs) VL_PURE { return lhs; } +inline std::string& VL_CVT_PACK_STR_NN(std::string& lhs) VL_PURE { return lhs; } +inline std::string VL_CVT_PACK_STR_NI(IData lhs) VL_PURE { + VlWide lw; + VL_SET_WI(lw, lhs); + return VL_CVT_PACK_STR_NW(1, lw); +} +inline std::string VL_CONCATN_NNN(const std::string& lhs, const std::string& rhs) VL_PURE { + return lhs + rhs; +} +inline std::string VL_REPLICATEN_NNQ(int, int, int, const std::string& lhs, IData rep) VL_PURE { + std::string out; + out.reserve(lhs.length() * rep); + for (unsigned times = 0; times < rep; ++times) out += lhs; + return out; +} +inline std::string VL_REPLICATEN_NNI(int obits, int lbits, int rbits, const std::string& lhs, + IData rep) VL_PURE { + return VL_REPLICATEN_NNQ(obits, lbits, rbits, lhs, rep); +} + +inline IData VL_LEN_IN(const std::string& ld) { return ld.length(); } +extern std::string VL_TOLOWER_NN(const std::string& ld); +extern std::string VL_TOUPPER_NN(const std::string& ld); + +extern IData VL_FERROR_IN(IData fpi, std::string& outputr) VL_MT_SAFE; +extern IData VL_FOPEN_NN(const std::string& filename, const std::string& mode) VL_MT_SAFE; +extern IData VL_FOPEN_MCD_N(const std::string& filename) VL_MT_SAFE; +extern void VL_READMEM_N(bool hex, int bits, QData depth, int array_lsb, + const std::string& filename, void* memp, QData start, + QData end) VL_MT_SAFE; +extern void VL_WRITEMEM_N(bool hex, int bits, QData depth, int 
array_lsb, + const std::string& filename, const void* memp, QData start, + QData end) VL_MT_SAFE; +extern IData VL_SSCANF_INX(int lbits, const std::string& ld, const char* formatp, ...) VL_MT_SAFE; +extern void VL_SFORMAT_X(int obits_ignored, std::string& output, const char* formatp, + ...) VL_MT_SAFE; +extern std::string VL_SFORMATF_NX(const char* formatp, ...) VL_MT_SAFE; +extern void VL_TIMEFORMAT_IINI(int units, int precision, const std::string& suffix, int width, + VerilatedContext* contextp) VL_MT_SAFE; +extern IData VL_VALUEPLUSARGS_INW(int rbits, const std::string& ld, WDataOutP rwp) VL_MT_SAFE; +inline IData VL_VALUEPLUSARGS_INI(int rbits, const std::string& ld, CData& rdr) VL_MT_SAFE { + VlWide<2> rwp; + IData got = VL_VALUEPLUSARGS_INW(rbits, ld, rwp); + if (got) rdr = rwp[0]; + return got; +} +inline IData VL_VALUEPLUSARGS_INI(int rbits, const std::string& ld, SData& rdr) VL_MT_SAFE { + VlWide<2> rwp; + IData got = VL_VALUEPLUSARGS_INW(rbits, ld, rwp); + if (got) rdr = rwp[0]; + return got; +} +inline IData VL_VALUEPLUSARGS_INI(int rbits, const std::string& ld, IData& rdr) VL_MT_SAFE { + VlWide<2> rwp; + IData got = VL_VALUEPLUSARGS_INW(rbits, ld, rwp); + if (got) rdr = rwp[0]; + return got; +} +inline IData VL_VALUEPLUSARGS_INQ(int rbits, const std::string& ld, QData& rdr) VL_MT_SAFE { + VlWide<2> rwp; + IData got = VL_VALUEPLUSARGS_INW(rbits, ld, rwp); + if (got) rdr = VL_SET_QW(rwp); + return got; +} +inline IData VL_VALUEPLUSARGS_INQ(int rbits, const std::string& ld, double& rdr) VL_MT_SAFE { + VlWide<2> rwp; + IData got = VL_VALUEPLUSARGS_INW(rbits, ld, rwp); + if (got) rdr = VL_CVT_D_Q(VL_SET_QW(rwp)); + return got; +} +extern IData VL_VALUEPLUSARGS_INN(int, const std::string& ld, std::string& rdr) VL_MT_SAFE; + +//====================================================================== + +#endif // Guard diff --git a/include/verilated_heavy.h b/include/verilated_heavy.h index 598a99400..80129772f 100644 --- a/include/verilated_heavy.h +++ b/include/verilated_heavy.h @@ -12,966 +12,20 @@ //************************************************************************* /// /// \file -/// \brief Verilated string and data-type header +/// \brief Verilated old string and data-type header /// -/// This file is included automatically by Verilator at the top of -/// all C++ files it generates. It is used when strings or other -/// heavyweight types are required; these contents are not part of -/// verilated.h to save compile time when such types aren't used. +/// This file is deprecated, and provided for backwards compatibility. +/// Include verilated.h instead. 
/// //************************************************************************* #ifndef VERILATOR_VERILATED_HEAVY_H_ #define VERILATOR_VERILATED_HEAVY_H_ +#ifdef VL_NO_LEGACY +#error "Include instead of " +#endif + #include "verilated.h" -#include -#include -#include -#include -#include -#include -#include -#include - -//=================================================================== -// String formatters (required by below containers) - -extern std::string VL_TO_STRING(CData lhs); -extern std::string VL_TO_STRING(SData lhs); -extern std::string VL_TO_STRING(IData lhs); -extern std::string VL_TO_STRING(QData lhs); -inline std::string VL_TO_STRING(const std::string& obj) { return "\"" + obj + "\""; } -extern std::string VL_TO_STRING_W(int words, const WDataInP obj); - -//=================================================================== -// Shuffle RNG - -class VlURNG final { -public: - using result_type = size_t; - static constexpr size_t min() { return 0; } - static constexpr size_t max() { return 1ULL << 31; } - size_t operator()() { return VL_MASK_I(31) & VL_RANDOM_I(32); } -}; - -//=================================================================== -// Readmem/Writemem operation classes - -class VlReadMem final { - bool m_hex; // Hex format - int m_bits; // Bit width of values - const std::string& m_filename; // Filename - QData m_end; // End address (as specified by user) - FILE* m_fp; // File handle for filename - QData m_addr; // Next address to read - int m_linenum; // Line number last read from file -public: - VlReadMem(bool hex, int bits, const std::string& filename, QData start, QData end); - ~VlReadMem(); - bool isOpen() const { return m_fp != nullptr; } - int linenum() const { return m_linenum; } - bool get(QData& addrr, std::string& valuer); - void setData(void* valuep, const std::string& rhs); -}; - -class VlWriteMem final { - bool m_hex; // Hex format - int m_bits; // Bit width of values - FILE* m_fp; // File handle for filename - QData m_addr; // Next address to write -public: - VlWriteMem(bool hex, int bits, const std::string& filename, QData start, QData end); - ~VlWriteMem(); - bool isOpen() const { return m_fp != nullptr; } - void print(QData addr, bool addrstamp, const void* valuep); -}; - -//=================================================================== -/// Verilog wide packed bit container. -/// Similar to std::array, but lighter weight, only methods needed -/// by Verilator, to help compile time. -/// -/// A 'struct' as we want this to be an aggregate type that allows -/// static aggregate initialization. Consider data members private. -/// -/// For example a Verilog "bit [94:0]" will become a VlWide<3> because 3*32 -/// bits are needed to hold the 95 bits. The MSB (bit 96) must always be -/// zero in memory, but during intermediate operations in the Verilated -/// internals is unpredictable. - -template struct VlWide final { - // MEMBERS - // This should be the only data member, otherwise generated static initializers need updating - EData m_storage[T_Words]; // Contents of the packed array - - // CONSTRUCTORS - // Default constructors and destructor are used. Note however that C++20 requires that - // aggregate types do not have a user declared constructor, not even an explicitly defaulted - // one. - - // OPERATOR METHODS - // Default copy assignment operators are used. 
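// Illustrative sketch, not part of the patch: a minimal use of VlWide, which
// this change moves verbatim into verilated_types.h further down. A Verilog
// "bit [94:0]" lands in a VlWide<3>; the conversion operators declared just
// below let it decay to WDataOutP/WDataInP so the word-based VL_* helpers can
// operate on it directly.
#include "verilated.h"

static std::string exampleWide() {
    VlWide<3> w;                 // 3 x 32-bit words holding 95 significant bits
    w.at(0) = 0xdeadbeefU;       // bits 31:0
    w.at(1) = 0x12345678U;       // bits 63:32
    w.at(2) = 0x7fffffffU;       // bits 94:64; unused top storage bit kept zero
    return VL_TO_STRING_W(3, w.data());  // format all words, e.g. for debug printing
}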
- operator WDataOutP() { return &m_storage[0]; } // This also allows [] - operator WDataInP() const { return &m_storage[0]; } // This also allows [] - - // METHODS - const EData& at(size_t index) const { return m_storage[index]; } - EData& at(size_t index) { return m_storage[index]; } - WData* data() { return &m_storage[0]; } - const WData* data() const { return &m_storage[0]; } - bool operator<(const VlWide& rhs) const { - return VL_LT_W(T_Words, data(), rhs.data()); - } -}; - -// Convert a C array to std::array reference by pointer magic, without copy. -// Data type (second argument) is so the function template can automatically generate. -template -VlWide& VL_CVT_W_A(const WDataInP inp, const VlWide&) { - return *((VlWide*)inp); -} - -template std::string VL_TO_STRING(const VlWide& obj) { - return VL_TO_STRING_W(T_Words, obj.data()); -} - -//=================================================================== -// Verilog queue and dynamic array container -// There are no multithreaded locks on this; the base variable must -// be protected by other means -// -// Bound here is the maximum size() allowed, e.g. 1 + SystemVerilog bound -// For dynamic arrays it is always zero -template class VlQueue final { -private: - // TYPES - using Deque = std::deque; - -public: - using const_iterator = typename Deque::const_iterator; - -private: - // MEMBERS - Deque m_deque; // State of the assoc array - T_Value m_defaultValue; // Default value - -public: - // CONSTRUCTORS - // m_defaultValue isn't defaulted. Caller's constructor must do it. - VlQueue() = default; - ~VlQueue() = default; - VlQueue(const VlQueue&) = default; - VlQueue(VlQueue&&) = default; - VlQueue& operator=(const VlQueue&) = default; - VlQueue& operator=(VlQueue&&) = default; - - // Standard copy constructor works. Verilog: assoca = assocb - // Also must allow conversion from a different T_MaxSize queue - template VlQueue operator=(const VlQueue& rhs) { - m_deque = rhs.privateDeque(); - if (VL_UNLIKELY(T_MaxSize && T_MaxSize < m_deque.size())) m_deque.resize(T_MaxSize - 1); - return *this; - } - - static VlQueue cons(const T_Value& lhs) { - VlQueue out; - out.push_back(lhs); - return out; - } - static VlQueue cons(const T_Value& lhs, const T_Value& rhs) { - VlQueue out; - out.push_back(rhs); - out.push_back(lhs); - return out; - } - static VlQueue cons(const VlQueue& lhs, const T_Value& rhs) { - VlQueue out = lhs; - out.push_front(rhs); - return out; - } - static VlQueue cons(const T_Value& lhs, const VlQueue& rhs) { - VlQueue out = rhs; - out.push_back(lhs); - return out; - } - static VlQueue cons(const VlQueue& lhs, const VlQueue& rhs) { - VlQueue out = rhs; - for (const auto& i : lhs.m_deque) out.push_back(i); - return out; - } - - // METHODS - T_Value& atDefault() { return m_defaultValue; } - const T_Value& atDefault() const { return m_defaultValue; } - const Deque& privateDeque() const { return m_deque; } - - // Size. Verilog: function int size(), or int num() - int size() const { return m_deque.size(); } - // Clear array. 
Verilog: function void delete([input index]) - void clear() { m_deque.clear(); } - void erase(vlsint32_t index) { - if (VL_LIKELY(index >= 0 && index < m_deque.size())) - m_deque.erase(m_deque.begin() + index); - } - - // Dynamic array new[] becomes a renew() - void renew(size_t size) { - clear(); - m_deque.resize(size, atDefault()); - } - // Dynamic array new[]() becomes a renew_copy() - void renew_copy(size_t size, const VlQueue& rhs) { - if (size == 0) { - clear(); - } else { - *this = rhs; - m_deque.resize(size, atDefault()); - } - } - - // function void q.push_front(value) - void push_front(const T_Value& value) { - m_deque.push_front(value); - if (VL_UNLIKELY(T_MaxSize != 0 && m_deque.size() > T_MaxSize)) m_deque.pop_back(); - } - // function void q.push_back(value) - void push_back(const T_Value& value) { - if (VL_LIKELY(T_MaxSize == 0 || m_deque.size() < T_MaxSize)) m_deque.push_back(value); - } - // function value_t q.pop_front(); - T_Value pop_front() { - if (m_deque.empty()) return m_defaultValue; - T_Value v = m_deque.front(); - m_deque.pop_front(); - return v; - } - // function value_t q.pop_back(); - T_Value pop_back() { - if (m_deque.empty()) return m_defaultValue; - T_Value v = m_deque.back(); - m_deque.pop_back(); - return v; - } - - // Setting. Verilog: assoc[index] = v - // Can't just overload operator[] or provide a "at" reference to set, - // because we need to be able to insert only when the value is set - T_Value& at(vlsint32_t index) { - static T_Value s_throwAway; - // Needs to work for dynamic arrays, so does not use T_MaxSize - if (VL_UNLIKELY(index < 0 || index >= m_deque.size())) { - s_throwAway = atDefault(); - return s_throwAway; - } else { - return m_deque[index]; - } - } - // Accessing. Verilog: v = assoc[index] - const T_Value& at(vlsint32_t index) const { - static T_Value s_throwAway; - // Needs to work for dynamic arrays, so does not use T_MaxSize - if (VL_UNLIKELY(index < 0 || index >= m_deque.size())) { - return atDefault(); - } else { - return m_deque[index]; - } - } - // function void q.insert(index, value); - void insert(vlsint32_t index, const T_Value& value) { - if (VL_UNLIKELY(index < 0 || index >= m_deque.size())) return; - m_deque.insert(m_deque.begin() + index, value); - } - - // Return slice q[lsb:msb] - VlQueue slice(vlsint32_t lsb, vlsint32_t msb) const { - VlQueue out; - if (VL_UNLIKELY(lsb < 0)) lsb = 0; - if (VL_UNLIKELY(lsb >= m_deque.size())) lsb = m_deque.size() - 1; - if (VL_UNLIKELY(msb >= m_deque.size())) msb = m_deque.size() - 1; - for (vlsint32_t i = lsb; i <= msb; ++i) out.push_back(m_deque[i]); - return out; - } - - // For save/restore - const_iterator begin() const { return m_deque.begin(); } - const_iterator end() const { return m_deque.end(); } - - // Methods - void sort() { std::sort(m_deque.begin(), m_deque.end()); } - template void sort(Func with_func) { - // with_func returns arbitrary type to use for the sort comparison - std::sort(m_deque.begin(), m_deque.end(), [=](const T_Value& a, const T_Value& b) { - // index number is meaninless with sort, as it changes - return with_func(0, a) < with_func(0, b); - }); - } - void rsort() { std::sort(m_deque.rbegin(), m_deque.rend()); } - template void rsort(Func with_func) { - // with_func returns arbitrary type to use for the sort comparison - std::sort(m_deque.rbegin(), m_deque.rend(), [=](const T_Value& a, const T_Value& b) { - // index number is meaninless with sort, as it changes - return with_func(0, a) < with_func(0, b); - }); - } - void reverse() { 
std::reverse(m_deque.begin(), m_deque.end()); } - void shuffle() { std::shuffle(m_deque.begin(), m_deque.end(), VlURNG{}); } - VlQueue unique() const { - VlQueue out; - std::unordered_set saw; - for (const auto& i : m_deque) { - auto it = saw.find(i); - if (it == saw.end()) { - saw.insert(it, i); - out.push_back(i); - } - } - return out; - } - VlQueue unique_index() const { - VlQueue out; - IData index = 0; - std::unordered_set saw; - for (const auto& i : m_deque) { - auto it = saw.find(i); - if (it == saw.end()) { - saw.insert(it, i); - out.push_back(index); - } - ++index; - } - return out; - } - template VlQueue find(Func with_func) const { - VlQueue out; - IData index = 0; - for (const auto& i : m_deque) { - if (with_func(index, i)) out.push_back(i); - ++index; - } - return out; - } - template VlQueue find_index(Func with_func) const { - VlQueue out; - IData index = 0; - for (const auto& i : m_deque) { - if (with_func(index, i)) out.push_back(index); - ++index; - } - return out; - } - template VlQueue find_first(Func with_func) const { - // Can't use std::find_if as need index number - IData index = 0; - for (const auto& i : m_deque) { - if (with_func(index, i)) return VlQueue::cons(i); - ++index; - } - return VlQueue{}; - } - template VlQueue find_first_index(Func with_func) const { - IData index = 0; - for (const auto& i : m_deque) { - if (with_func(index, i)) return VlQueue::cons(index); - ++index; - } - return VlQueue{}; - } - template VlQueue find_last(Func with_func) const { - IData index = m_deque.size() - 1; - for (auto it = m_deque.rbegin(); it != m_deque.rend(); ++it) { - if (with_func(index, *it)) return VlQueue::cons(*it); - --index; - } - return VlQueue{}; - } - template VlQueue find_last_index(Func with_func) const { - IData index = m_deque.size() - 1; - for (auto it = m_deque.rbegin(); it != m_deque.rend(); ++it) { - if (with_func(index, *it)) return VlQueue::cons(index); - --index; - } - return VlQueue{}; - } - - // Reduction operators - VlQueue min() const { - if (m_deque.empty()) return VlQueue{}; - const auto it = std::min_element(m_deque.begin(), m_deque.end()); - return VlQueue::cons(*it); - } - VlQueue max() const { - if (m_deque.empty()) return VlQueue{}; - const auto it = std::max_element(m_deque.begin(), m_deque.end()); - return VlQueue::cons(*it); - } - - T_Value r_sum() const { - T_Value out(0); // Type must have assignment operator - for (const auto& i : m_deque) out += i; - return out; - } - template T_Value r_sum(Func with_func) const { - T_Value out(0); // Type must have assignment operator - IData index = 0; - for (const auto& i : m_deque) out += with_func(index++, i); - return out; - } - T_Value r_product() const { - if (m_deque.empty()) return T_Value(0); - auto it = m_deque.begin(); - T_Value out{*it}; - ++it; - for (; it != m_deque.end(); ++it) out *= *it; - return out; - } - template T_Value r_product(Func with_func) const { - if (m_deque.empty()) return T_Value(0); - auto it = m_deque.begin(); - IData index = 0; - T_Value out{with_func(index, *it)}; - ++it; - ++index; - for (; it != m_deque.end(); ++it) out *= with_func(index++, *it); - return out; - } - T_Value r_and() const { - if (m_deque.empty()) return T_Value(0); - auto it = m_deque.begin(); - T_Value out{*it}; - ++it; - for (; it != m_deque.end(); ++it) out &= *it; - return out; - } - template T_Value r_and(Func with_func) const { - if (m_deque.empty()) return T_Value(0); - auto it = m_deque.begin(); - IData index = 0; - T_Value out{with_func(index, *it)}; - ++it; - ++index; - for (; it != 
m_deque.end(); ++it) out &= with_func(index, *it); - return out; - } - T_Value r_or() const { - T_Value out(0); // Type must have assignment operator - for (const auto& i : m_deque) out |= i; - return out; - } - template T_Value r_or(Func with_func) const { - T_Value out(0); // Type must have assignment operator - IData index = 0; - for (const auto& i : m_deque) out |= with_func(index++, i); - return out; - } - T_Value r_xor() const { - T_Value out(0); // Type must have assignment operator - for (const auto& i : m_deque) out ^= i; - return out; - } - template T_Value r_xor(Func with_func) const { - T_Value out(0); // Type must have assignment operator - IData index = 0; - for (const auto& i : m_deque) out ^= with_func(index++, i); - return out; - } - - // Dumping. Verilog: str = $sformatf("%p", assoc) - std::string to_string() const { - if (m_deque.empty()) return "'{}"; // No trailing space - std::string out = "'{"; - std::string comma; - for (const auto& i : m_deque) { - out += comma + VL_TO_STRING(i); - comma = ", "; - } - return out + "} "; - } -}; - -template std::string VL_TO_STRING(const VlQueue& obj) { - return obj.to_string(); -} - -//=================================================================== -// Verilog associative array container -// There are no multithreaded locks on this; the base variable must -// be protected by other means -// -template class VlAssocArray final { -private: - // TYPES - using Map = std::map; - -public: - using const_iterator = typename Map::const_iterator; - -private: - // MEMBERS - Map m_map; // State of the assoc array - T_Value m_defaultValue; // Default value - -public: - // CONSTRUCTORS - // m_defaultValue isn't defaulted. Caller's constructor must do it. - VlAssocArray() = default; - ~VlAssocArray() = default; - VlAssocArray(const VlAssocArray&) = default; - VlAssocArray(VlAssocArray&&) = default; - VlAssocArray& operator=(const VlAssocArray&) = default; - VlAssocArray& operator=(VlAssocArray&&) = default; - - // METHODS - T_Value& atDefault() { return m_defaultValue; } - const T_Value& atDefault() const { return m_defaultValue; } - - // Size of array. Verilog: function int size(), or int num() - int size() const { return m_map.size(); } - // Clear array. Verilog: function void delete([input index]) - void clear() { m_map.clear(); } - void erase(const T_Key& index) { m_map.erase(index); } - // Return 0/1 if element exists. Verilog: function int exists(input index) - int exists(const T_Key& index) const { return m_map.find(index) != m_map.end(); } - // Return first element. Verilog: function int first(ref index); - int first(T_Key& indexr) const { - const auto it = m_map.cbegin(); - if (it == m_map.end()) return 0; - indexr = it->first; - return 1; - } - // Return last element. Verilog: function int last(ref index) - int last(T_Key& indexr) const { - const auto it = m_map.crbegin(); - if (it == m_map.rend()) return 0; - indexr = it->first; - return 1; - } - // Return next element. Verilog: function int next(ref index) - int next(T_Key& indexr) const { - auto it = m_map.find(indexr); - if (VL_UNLIKELY(it == m_map.end())) return 0; - ++it; - if (VL_UNLIKELY(it == m_map.end())) return 0; - indexr = it->first; - return 1; - } - // Return prev element. Verilog: function int prev(ref index) - int prev(T_Key& indexr) const { - auto it = m_map.find(indexr); - if (VL_UNLIKELY(it == m_map.end())) return 0; - if (VL_UNLIKELY(it == m_map.begin())) return 0; - --it; - indexr = it->first; - return 1; - } - // Setting. 
Verilog: assoc[index] = v - // Can't just overload operator[] or provide a "at" reference to set, - // because we need to be able to insert only when the value is set - T_Value& at(const T_Key& index) { - const auto it = m_map.find(index); - if (it == m_map.end()) { - std::pair pit = m_map.emplace(index, m_defaultValue); - return pit.first->second; - } - return it->second; - } - // Accessing. Verilog: v = assoc[index] - const T_Value& at(const T_Key& index) const { - const auto it = m_map.find(index); - if (it == m_map.end()) { - return m_defaultValue; - } else { - return it->second; - } - } - // Setting as a chained operation - VlAssocArray& set(const T_Key& index, const T_Value& value) { - at(index) = value; - return *this; - } - VlAssocArray& setDefault(const T_Value& value) { - atDefault() = value; - return *this; - } - - // For save/restore - const_iterator begin() const { return m_map.begin(); } - const_iterator end() const { return m_map.end(); } - - // Methods - VlQueue unique() const { - VlQueue out; - std::set saw; - for (const auto& i : m_map) { - auto it = saw.find(i.second); - if (it == saw.end()) { - saw.insert(it, i.second); - out.push_back(i.second); - } - } - return out; - } - VlQueue unique_index() const { - VlQueue out; - std::set saw; - for (const auto& i : m_map) { - auto it = saw.find(i.second); - if (it == saw.end()) { - saw.insert(it, i.second); - out.push_back(i.first); - } - } - return out; - } - template VlQueue find(Func with_func) const { - VlQueue out; - for (const auto& i : m_map) - if (with_func(i.first, i.second)) out.push_back(i.second); - return out; - } - template VlQueue find_index(Func with_func) const { - VlQueue out; - for (const auto& i : m_map) - if (with_func(i.first, i.second)) out.push_back(i.first); - return out; - } - template VlQueue find_first(Func with_func) const { - const auto it - = std::find_if(m_map.begin(), m_map.end(), [=](const std::pair& i) { - return with_func(i.first, i.second); - }); - if (it == m_map.end()) return VlQueue{}; - return VlQueue::cons(it->second); - } - template VlQueue find_first_index(Func with_func) const { - const auto it - = std::find_if(m_map.begin(), m_map.end(), [=](const std::pair& i) { - return with_func(i.first, i.second); - }); - if (it == m_map.end()) return VlQueue{}; - return VlQueue::cons(it->first); - } - template VlQueue find_last(Func with_func) const { - const auto it - = std::find_if(m_map.rbegin(), m_map.rend(), [=](const std::pair& i) { - return with_func(i.first, i.second); - }); - if (it == m_map.rend()) return VlQueue{}; - return VlQueue::cons(it->second); - } - template VlQueue find_last_index(Func with_func) const { - const auto it - = std::find_if(m_map.rbegin(), m_map.rend(), [=](const std::pair& i) { - return with_func(i.first, i.second); - }); - if (it == m_map.rend()) return VlQueue{}; - return VlQueue::cons(it->first); - } - - // Reduction operators - VlQueue min() const { - if (m_map.empty()) return VlQueue(); - const auto it = std::min_element( - m_map.begin(), m_map.end(), - [](const std::pair& a, const std::pair& b) { - return a.second < b.second; - }); - return VlQueue::cons(it->second); - } - VlQueue max() const { - if (m_map.empty()) return VlQueue(); - const auto it = std::max_element( - m_map.begin(), m_map.end(), - [](const std::pair& a, const std::pair& b) { - return a.second < b.second; - }); - return VlQueue::cons(it->second); - } - - T_Value r_sum() const { - T_Value out(0); // Type must have assignment operator - for (const auto& i : m_map) out += i.second; - return 
out; - } - template T_Value r_sum(Func with_func) const { - T_Value out(0); // Type must have assignment operator - for (const auto& i : m_map) out += with_func(i.first, i.second); - return out; - } - T_Value r_product() const { - if (m_map.empty()) return T_Value(0); - auto it = m_map.begin(); - T_Value out{it->second}; - ++it; - for (; it != m_map.end(); ++it) out *= it->second; - return out; - } - template T_Value r_product(Func with_func) const { - if (m_map.empty()) return T_Value(0); - auto it = m_map.begin(); - T_Value out{with_func(it->first, it->second)}; - ++it; - for (; it != m_map.end(); ++it) out *= with_func(it->first, it->second); - return out; - } - T_Value r_and() const { - if (m_map.empty()) return T_Value(0); - auto it = m_map.begin(); - T_Value out{it->second}; - ++it; - for (; it != m_map.end(); ++it) out &= it->second; - return out; - } - template T_Value r_and(Func with_func) const { - if (m_map.empty()) return T_Value(0); - auto it = m_map.begin(); - T_Value out{with_func(it->first, it->second)}; - ++it; - for (; it != m_map.end(); ++it) out &= with_func(it->first, it->second); - return out; - } - T_Value r_or() const { - T_Value out(0); // Type must have assignment operator - for (const auto& i : m_map) out |= i.second; - return out; - } - template T_Value r_or(Func with_func) const { - T_Value out(0); // Type must have assignment operator - for (const auto& i : m_map) out |= with_func(i.first, i.second); - return out; - } - T_Value r_xor() const { - T_Value out(0); // Type must have assignment operator - for (const auto& i : m_map) out ^= i.second; - return out; - } - template T_Value r_xor(Func with_func) const { - T_Value out(0); // Type must have assignment operator - for (const auto& i : m_map) out ^= with_func(i.first, i.second); - return out; - } - - // Dumping. Verilog: str = $sformatf("%p", assoc) - std::string to_string() const { - if (m_map.empty()) return "'{}"; // No trailing space - std::string out = "'{"; - std::string comma; - for (const auto& i : m_map) { - out += comma + VL_TO_STRING(i.first) + ":" + VL_TO_STRING(i.second); - comma = ", "; - } - // Default not printed - maybe random init data - return out + "} "; - } -}; - -template -std::string VL_TO_STRING(const VlAssocArray& obj) { - return obj.to_string(); -} - -template -void VL_READMEM_N(bool hex, int bits, const std::string& filename, - VlAssocArray& obj, QData start, QData end) VL_MT_SAFE { - VlReadMem rmem{hex, bits, filename, start, end}; - if (VL_UNLIKELY(!rmem.isOpen())) return; - while (true) { - QData addr; - std::string data; - if (rmem.get(addr /*ref*/, data /*ref*/)) { - rmem.setData(&(obj.at(addr)), data); - } else { - break; - } - } -} - -template -void VL_WRITEMEM_N(bool hex, int bits, const std::string& filename, - const VlAssocArray& obj, QData start, QData end) VL_MT_SAFE { - VlWriteMem wmem{hex, bits, filename, start, end}; - if (VL_UNLIKELY(!wmem.isOpen())) return; - for (const auto& i : obj) { - const QData addr = i.first; - if (addr >= start && addr <= end) wmem.print(addr, true, &(i.second)); - } -} - -//=================================================================== -/// Verilog unpacked array container -/// For when a standard C++[] array is not sufficient, e.g. an -/// array under a queue, or methods operating on the array. -/// -/// A 'struct' as we want this to be an aggregate type that allows -/// static aggregate initialization. Consider data members private. 
-/// -/// This class may get exposed to a Verilated Model's top I/O, if the top -/// IO has an unpacked array. - -template struct VlUnpacked final { - // MEMBERS - // This should be the only data member, otherwise generated static initializers need updating - T_Value m_storage[T_Depth]; // Contents of the unpacked array - - // CONSTRUCTORS - // Default constructors and destructor are used. Note however that C++20 requires that - // aggregate types do not have a user declared constructor, not even an explicitly defaulted - // one. - - // OPERATOR METHODS - // Default copy assignment operators are used. - - // METHODS - // Raw access - WData* data() { return &m_storage[0]; } - const WData* data() const { return &m_storage[0]; } - - T_Value& operator[](size_t index) { return m_storage[index]; }; - const T_Value& operator[](size_t index) const { return m_storage[index]; }; - - // Dumping. Verilog: str = $sformatf("%p", assoc) - std::string to_string() const { - std::string out = "'{"; - std::string comma; - for (int i = 0; i < T_Depth; ++i) { - out += comma + VL_TO_STRING(m_storage[i]); - comma = ", "; - } - return out + "} "; - } -}; - -template -std::string VL_TO_STRING(const VlUnpacked& obj) { - return obj.to_string(); -} - -//=================================================================== -// Verilog class reference container -// There are no multithreaded locks on this; the base variable must -// be protected by other means - -#define VlClassRef std::shared_ptr - -template // T typically of type VlClassRef -inline T VL_NULL_CHECK(T t, const char* filename, int linenum) { - if (VL_UNLIKELY(!t)) Verilated::nullPointerError(filename, linenum); - return t; -} - -template -static inline bool VL_CAST_DYNAMIC(VlClassRef in, VlClassRef& outr) { - VlClassRef casted = std::dynamic_pointer_cast(in); - if (VL_LIKELY(casted)) { - outr = casted; - return true; - } else { - return false; - } -} - -//====================================================================== -// Conversion functions - -extern std::string VL_CVT_PACK_STR_NW(int lwords, const WDataInP lwp) VL_MT_SAFE; -inline std::string VL_CVT_PACK_STR_NQ(QData lhs) VL_PURE { - VlWide lw; - VL_SET_WQ(lw, lhs); - return VL_CVT_PACK_STR_NW(VL_WQ_WORDS_E, lw); -} -inline std::string VL_CVT_PACK_STR_NN(const std::string& lhs) VL_PURE { return lhs; } -inline std::string& VL_CVT_PACK_STR_NN(std::string& lhs) VL_PURE { return lhs; } -inline std::string VL_CVT_PACK_STR_NI(IData lhs) VL_PURE { - VlWide lw; - VL_SET_WI(lw, lhs); - return VL_CVT_PACK_STR_NW(1, lw); -} -inline std::string VL_CONCATN_NNN(const std::string& lhs, const std::string& rhs) VL_PURE { - return lhs + rhs; -} -inline std::string VL_REPLICATEN_NNQ(int, int, int, const std::string& lhs, IData rep) VL_PURE { - std::string out; - out.reserve(lhs.length() * rep); - for (unsigned times = 0; times < rep; ++times) out += lhs; - return out; -} -inline std::string VL_REPLICATEN_NNI(int obits, int lbits, int rbits, const std::string& lhs, - IData rep) VL_PURE { - return VL_REPLICATEN_NNQ(obits, lbits, rbits, lhs, rep); -} - -inline IData VL_LEN_IN(const std::string& ld) { return ld.length(); } -extern std::string VL_TOLOWER_NN(const std::string& ld); -extern std::string VL_TOUPPER_NN(const std::string& ld); - -extern IData VL_FERROR_IN(IData fpi, std::string& outputr) VL_MT_SAFE; -extern IData VL_FOPEN_NN(const std::string& filename, const std::string& mode) VL_MT_SAFE; -extern IData VL_FOPEN_MCD_N(const std::string& filename) VL_MT_SAFE; -extern void VL_READMEM_N(bool hex, int bits, 
QData depth, int array_lsb, - const std::string& filename, void* memp, QData start, - QData end) VL_MT_SAFE; -extern void VL_WRITEMEM_N(bool hex, int bits, QData depth, int array_lsb, - const std::string& filename, const void* memp, QData start, - QData end) VL_MT_SAFE; -extern IData VL_SSCANF_INX(int lbits, const std::string& ld, const char* formatp, ...) VL_MT_SAFE; -extern void VL_SFORMAT_X(int obits_ignored, std::string& output, const char* formatp, - ...) VL_MT_SAFE; -extern std::string VL_SFORMATF_NX(const char* formatp, ...) VL_MT_SAFE; -extern void VL_TIMEFORMAT_IINI(int units, int precision, const std::string& suffix, int width, - VerilatedContext* contextp) VL_MT_SAFE; -extern IData VL_VALUEPLUSARGS_INW(int rbits, const std::string& ld, WDataOutP rwp) VL_MT_SAFE; -inline IData VL_VALUEPLUSARGS_INI(int rbits, const std::string& ld, CData& rdr) VL_MT_SAFE { - VlWide<2> rwp; // WData must always be at least 2 - IData got = VL_VALUEPLUSARGS_INW(rbits, ld, rwp); - if (got) rdr = rwp[0]; - return got; -} -inline IData VL_VALUEPLUSARGS_INI(int rbits, const std::string& ld, SData& rdr) VL_MT_SAFE { - VlWide<2> rwp; // WData must always be at least 2 - IData got = VL_VALUEPLUSARGS_INW(rbits, ld, rwp); - if (got) rdr = rwp[0]; - return got; -} -inline IData VL_VALUEPLUSARGS_INI(int rbits, const std::string& ld, IData& rdr) VL_MT_SAFE { - VlWide<2> rwp; - IData got = VL_VALUEPLUSARGS_INW(rbits, ld, rwp); - if (got) rdr = rwp[0]; - return got; -} -inline IData VL_VALUEPLUSARGS_INQ(int rbits, const std::string& ld, QData& rdr) VL_MT_SAFE { - VlWide<2> rwp; - IData got = VL_VALUEPLUSARGS_INW(rbits, ld, rwp); - if (got) rdr = VL_SET_QW(rwp); - return got; -} -inline IData VL_VALUEPLUSARGS_INQ(int rbits, const std::string& ld, double& rdr) VL_MT_SAFE { - VlWide<2> rwp; - IData got = VL_VALUEPLUSARGS_INW(rbits, ld, rwp); - if (got) rdr = VL_CVT_D_Q(VL_SET_QW(rwp)); - return got; -} -extern IData VL_VALUEPLUSARGS_INN(int, const std::string& ld, std::string& rdr) VL_MT_SAFE; - -//====================================================================== -// Strings - -extern std::string VL_PUTC_N(const std::string& lhs, IData rhs, CData ths) VL_PURE; -extern CData VL_GETC_N(const std::string& lhs, IData rhs) VL_PURE; -extern std::string VL_SUBSTR_N(const std::string& lhs, IData rhs, IData ths) VL_PURE; - -inline IData VL_CMP_NN(const std::string& lhs, const std::string& rhs, bool ignoreCase) VL_PURE { - // SystemVerilog does not allow a string variable to contain '\0'. - // So C functions such as strcmp() can correctly compare strings. 
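// Illustrative sketch, not part of the patch: what this helper returns for a
// pair of made-up values; the comments relating it to the SystemVerilog
// compare()/icompare() string methods are my reading of the surrounding code.
#include "verilated.h"

static void exampleStringCompare() {
    const std::string a = "Hello";
    const std::string b = "hello";
    const IData cs = VL_CMP_NN(a, b, false);  // strcmp-style, like a.compare(b); nonzero here
    const IData ci = VL_CMP_NN(a, b, true);   // case-insensitive, like a.icompare(b); zero here
    (void)cs;
    (void)ci;
}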
- if (ignoreCase) { - return VL_STRCASECMP(lhs.c_str(), rhs.c_str()); - } else { - return std::strcmp(lhs.c_str(), rhs.c_str()); - } -} - -extern IData VL_ATOI_N(const std::string& str, int base) VL_PURE; - -extern IData VL_FGETS_NI(std::string& dest, IData fpi); - #endif // Guard diff --git a/include/verilated_imp.h b/include/verilated_imp.h index fef103f7b..ed17f86e2 100644 --- a/include/verilated_imp.h +++ b/include/verilated_imp.h @@ -30,7 +30,6 @@ #include "verilatedos.h" #include "verilated.h" -#include "verilated_heavy.h" #include "verilated_syms.h" #include diff --git a/include/verilated_save.h b/include/verilated_save.h index 1a6c14e13..f49c15f91 100644 --- a/include/verilated_save.h +++ b/include/verilated_save.h @@ -23,7 +23,7 @@ #define VERILATOR_VERILATED_SAVE_C_H_ #include "verilatedos.h" -#include "verilated_heavy.h" +#include "verilated.h" #include diff --git a/include/verilated_syms.h b/include/verilated_syms.h index 1dc633e6e..160bd31c9 100644 --- a/include/verilated_syms.h +++ b/include/verilated_syms.h @@ -30,7 +30,7 @@ #define VERILATOR_VERILATED_SYMS_H_ #include "verilatedos.h" -#include "verilated_heavy.h" +#include "verilated.h" #include "verilated_sym_props.h" #include diff --git a/include/verilated_types.h b/include/verilated_types.h new file mode 100644 index 000000000..8a0dde96d --- /dev/null +++ b/include/verilated_types.h @@ -0,0 +1,897 @@ +// -*- mode: C++; c-file-style: "cc-mode" -*- +//************************************************************************* +// +// Code available from: https://verilator.org +// +// Copyright 2003-2021 by Wilson Snyder. This program is free software; you can +// redistribute it and/or modify it under the terms of either the GNU +// Lesser General Public License Version 3 or the Perl Artistic License +// Version 2.0. +// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 +// +//************************************************************************* +/// +/// \file +/// \brief Verilated common data type containers +/// +/// verilated.h should be included instead of this file. +/// +/// Those macro/function/variable starting or ending in _ are internal, +/// however many of the other function/macros here are also internal. 
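// Illustrative sketch, not part of the patch: after this refactor user code does
// not include this header directly; verilated.h defines
// VERILATOR_VERILATED_H_INTERNAL_ and then pulls it in, so the guard just below
// rejects a direct include.
#include "verilated.h"          // correct: verilated_types.h arrives transitively
// #include "verilated_types.h" // would trip the #error guard below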
+/// +//************************************************************************* + +#ifndef VERILATOR_VERILATED_TYPES_H_ +#define VERILATOR_VERILATED_TYPES_H_ + +#ifndef VERILATOR_VERILATED_H_INTERNAL_ +#error "verilated_types.h should only be included by verilated.h" +#endif + +//=================================================================== +// String formatters (required by below containers) + +extern std::string VL_TO_STRING(CData lhs); +extern std::string VL_TO_STRING(SData lhs); +extern std::string VL_TO_STRING(IData lhs); +extern std::string VL_TO_STRING(QData lhs); +inline std::string VL_TO_STRING(const std::string& obj) { return "\"" + obj + "\""; } +extern std::string VL_TO_STRING_W(int words, const WDataInP obj); + +//========================================================================= +// Declare net data types + +#define VL_SIG8(name, msb, lsb) CData name ///< Declare signal, 1-8 bits +#define VL_SIG16(name, msb, lsb) SData name ///< Declare signal, 9-16 bits +#define VL_SIG64(name, msb, lsb) QData name ///< Declare signal, 33-64 bits +#define VL_SIG(name, msb, lsb) IData name ///< Declare signal, 17-32 bits +#define VL_SIGW(name, msb, lsb, words) WData name[words] ///< Declare signal, 65+ bits +#define VL_IN8(name, msb, lsb) CData name ///< Declare input signal, 1-8 bits +#define VL_IN16(name, msb, lsb) SData name ///< Declare input signal, 9-16 bits +#define VL_IN64(name, msb, lsb) QData name ///< Declare input signal, 33-64 bits +#define VL_IN(name, msb, lsb) IData name ///< Declare input signal, 17-32 bits +#define VL_INW(name, msb, lsb, words) WData name[words] ///< Declare input signal, 65+ bits +#define VL_INOUT8(name, msb, lsb) CData name ///< Declare bidir signal, 1-8 bits +#define VL_INOUT16(name, msb, lsb) SData name ///< Declare bidir signal, 9-16 bits +#define VL_INOUT64(name, msb, lsb) QData name ///< Declare bidir signal, 33-64 bits +#define VL_INOUT(name, msb, lsb) IData name ///< Declare bidir signal, 17-32 bits +#define VL_INOUTW(name, msb, lsb, words) WData name[words] ///< Declare bidir signal, 65+ bits +#define VL_OUT8(name, msb, lsb) CData name ///< Declare output signal, 1-8 bits +#define VL_OUT16(name, msb, lsb) SData name ///< Declare output signal, 9-16 bits +#define VL_OUT64(name, msb, lsb) QData name ///< Declare output signal, 33-64bits +#define VL_OUT(name, msb, lsb) IData name ///< Declare output signal, 17-32 bits +#define VL_OUTW(name, msb, lsb, words) WData name[words] ///< Declare output signal, 65+ bits + +//=================================================================== +// Shuffle RNG + +extern vluint64_t vl_rand64() VL_MT_SAFE; + +class VlURNG final { +public: + using result_type = size_t; + static constexpr size_t min() { return 0; } + static constexpr size_t max() { return 1ULL << 31; } + size_t operator()() { return VL_MASK_I(31) & vl_rand64(); } +}; + +//=================================================================== +// Readmem/Writemem operation classes + +class VlReadMem final { + bool m_hex; // Hex format + int m_bits; // Bit width of values + const std::string& m_filename; // Filename + QData m_end; // End address (as specified by user) + FILE* m_fp; // File handle for filename + QData m_addr; // Next address to read + int m_linenum; // Line number last read from file +public: + VlReadMem(bool hex, int bits, const std::string& filename, QData start, QData end); + ~VlReadMem(); + bool isOpen() const { return m_fp != nullptr; } + int linenum() const { return m_linenum; } + bool get(QData& addrr, std::string& 
valuer); + void setData(void* valuep, const std::string& rhs); +}; + +class VlWriteMem final { + bool m_hex; // Hex format + int m_bits; // Bit width of values + FILE* m_fp; // File handle for filename + QData m_addr; // Next address to write +public: + VlWriteMem(bool hex, int bits, const std::string& filename, QData start, QData end); + ~VlWriteMem(); + bool isOpen() const { return m_fp != nullptr; } + void print(QData addr, bool addrstamp, const void* valuep); +}; + +//=================================================================== +/// Verilog wide packed bit container. +/// Similar to std::array, but lighter weight, only methods needed +/// by Verilator, to help compile time. +/// +/// A 'struct' as we want this to be an aggregate type that allows +/// static aggregate initialization. Consider data members private. +/// +/// For example a Verilog "bit [94:0]" will become a VlWide<3> because 3*32 +/// bits are needed to hold the 95 bits. The MSB (bit 96) must always be +/// zero in memory, but during intermediate operations in the Verilated +/// internals is unpredictable. + +static int _vl_cmp_w(int words, WDataInP const lwp, WDataInP const rwp) VL_MT_SAFE; + +template struct VlWide final { + // MEMBERS + // This should be the only data member, otherwise generated static initializers need updating + EData m_storage[T_Words]; // Contents of the packed array + + // CONSTRUCTORS + // Default constructors and destructor are used. Note however that C++20 requires that + // aggregate types do not have a user declared constructor, not even an explicitly defaulted + // one. + + // OPERATOR METHODS + // Default copy assignment operators are used. + operator WDataOutP() { return &m_storage[0]; } // This also allows [] + operator WDataInP() const { return &m_storage[0]; } // This also allows [] + + // METHODS + const EData& at(size_t index) const { return m_storage[index]; } + EData& at(size_t index) { return m_storage[index]; } + WData* data() { return &m_storage[0]; } + const WData* data() const { return &m_storage[0]; } + bool operator<(const VlWide& rhs) const { + return _vl_cmp_w(T_Words, data(), rhs.data()) < 0; + } +}; + +// Convert a C array to std::array reference by pointer magic, without copy. +// Data type (second argument) is so the function template can automatically generate. +template +VlWide& VL_CVT_W_A(const WDataInP inp, const VlWide&) { + return *((VlWide*)inp); +} + +template std::string VL_TO_STRING(const VlWide& obj) { + return VL_TO_STRING_W(T_Words, obj.data()); +} + +//=================================================================== +// Verilog queue and dynamic array container +// There are no multithreaded locks on this; the base variable must +// be protected by other means +// +// Bound here is the maximum size() allowed, e.g. 1 + SystemVerilog bound +// For dynamic arrays it is always zero +template class VlQueue final { +private: + // TYPES + using Deque = std::deque; + +public: + using const_iterator = typename Deque::const_iterator; + +private: + // MEMBERS + Deque m_deque; // State of the assoc array + T_Value m_defaultValue; // Default value + +public: + // CONSTRUCTORS + // m_defaultValue isn't defaulted. Caller's constructor must do it. + VlQueue() = default; + ~VlQueue() = default; + VlQueue(const VlQueue&) = default; + VlQueue(VlQueue&&) = default; + VlQueue& operator=(const VlQueue&) = default; + VlQueue& operator=(VlQueue&&) = default; + + // Standard copy constructor works. 
Verilog: assoca = assocb + // Also must allow conversion from a different T_MaxSize queue + template VlQueue operator=(const VlQueue& rhs) { + m_deque = rhs.privateDeque(); + if (VL_UNLIKELY(T_MaxSize && T_MaxSize < m_deque.size())) m_deque.resize(T_MaxSize - 1); + return *this; + } + + static VlQueue cons(const T_Value& lhs) { + VlQueue out; + out.push_back(lhs); + return out; + } + static VlQueue cons(const T_Value& lhs, const T_Value& rhs) { + VlQueue out; + out.push_back(rhs); + out.push_back(lhs); + return out; + } + static VlQueue cons(const VlQueue& lhs, const T_Value& rhs) { + VlQueue out = lhs; + out.push_front(rhs); + return out; + } + static VlQueue cons(const T_Value& lhs, const VlQueue& rhs) { + VlQueue out = rhs; + out.push_back(lhs); + return out; + } + static VlQueue cons(const VlQueue& lhs, const VlQueue& rhs) { + VlQueue out = rhs; + for (const auto& i : lhs.m_deque) out.push_back(i); + return out; + } + + // METHODS + T_Value& atDefault() { return m_defaultValue; } + const T_Value& atDefault() const { return m_defaultValue; } + const Deque& privateDeque() const { return m_deque; } + + // Size. Verilog: function int size(), or int num() + int size() const { return m_deque.size(); } + // Clear array. Verilog: function void delete([input index]) + void clear() { m_deque.clear(); } + void erase(vlsint32_t index) { + if (VL_LIKELY(index >= 0 && index < m_deque.size())) + m_deque.erase(m_deque.begin() + index); + } + + // Dynamic array new[] becomes a renew() + void renew(size_t size) { + clear(); + m_deque.resize(size, atDefault()); + } + // Dynamic array new[]() becomes a renew_copy() + void renew_copy(size_t size, const VlQueue& rhs) { + if (size == 0) { + clear(); + } else { + *this = rhs; + m_deque.resize(size, atDefault()); + } + } + + // function void q.push_front(value) + void push_front(const T_Value& value) { + m_deque.push_front(value); + if (VL_UNLIKELY(T_MaxSize != 0 && m_deque.size() > T_MaxSize)) m_deque.pop_back(); + } + // function void q.push_back(value) + void push_back(const T_Value& value) { + if (VL_LIKELY(T_MaxSize == 0 || m_deque.size() < T_MaxSize)) m_deque.push_back(value); + } + // function value_t q.pop_front(); + T_Value pop_front() { + if (m_deque.empty()) return m_defaultValue; + T_Value v = m_deque.front(); + m_deque.pop_front(); + return v; + } + // function value_t q.pop_back(); + T_Value pop_back() { + if (m_deque.empty()) return m_defaultValue; + T_Value v = m_deque.back(); + m_deque.pop_back(); + return v; + } + + // Setting. Verilog: assoc[index] = v + // Can't just overload operator[] or provide a "at" reference to set, + // because we need to be able to insert only when the value is set + T_Value& at(vlsint32_t index) { + static T_Value s_throwAway; + // Needs to work for dynamic arrays, so does not use T_MaxSize + if (VL_UNLIKELY(index < 0 || index >= m_deque.size())) { + s_throwAway = atDefault(); + return s_throwAway; + } else { + return m_deque[index]; + } + } + // Accessing. 
Verilog: v = assoc[index] + const T_Value& at(vlsint32_t index) const { + static T_Value s_throwAway; + // Needs to work for dynamic arrays, so does not use T_MaxSize + if (VL_UNLIKELY(index < 0 || index >= m_deque.size())) { + return atDefault(); + } else { + return m_deque[index]; + } + } + // function void q.insert(index, value); + void insert(vlsint32_t index, const T_Value& value) { + if (VL_UNLIKELY(index < 0 || index >= m_deque.size())) return; + m_deque.insert(m_deque.begin() + index, value); + } + + // Return slice q[lsb:msb] + VlQueue slice(vlsint32_t lsb, vlsint32_t msb) const { + VlQueue out; + if (VL_UNLIKELY(lsb < 0)) lsb = 0; + if (VL_UNLIKELY(lsb >= m_deque.size())) lsb = m_deque.size() - 1; + if (VL_UNLIKELY(msb >= m_deque.size())) msb = m_deque.size() - 1; + for (vlsint32_t i = lsb; i <= msb; ++i) out.push_back(m_deque[i]); + return out; + } + + // For save/restore + const_iterator begin() const { return m_deque.begin(); } + const_iterator end() const { return m_deque.end(); } + + // Methods + void sort() { std::sort(m_deque.begin(), m_deque.end()); } + template void sort(Func with_func) { + // with_func returns arbitrary type to use for the sort comparison + std::sort(m_deque.begin(), m_deque.end(), [=](const T_Value& a, const T_Value& b) { + // index number is meaninless with sort, as it changes + return with_func(0, a) < with_func(0, b); + }); + } + void rsort() { std::sort(m_deque.rbegin(), m_deque.rend()); } + template void rsort(Func with_func) { + // with_func returns arbitrary type to use for the sort comparison + std::sort(m_deque.rbegin(), m_deque.rend(), [=](const T_Value& a, const T_Value& b) { + // index number is meaninless with sort, as it changes + return with_func(0, a) < with_func(0, b); + }); + } + void reverse() { std::reverse(m_deque.begin(), m_deque.end()); } + void shuffle() { std::shuffle(m_deque.begin(), m_deque.end(), VlURNG{}); } + VlQueue unique() const { + VlQueue out; + std::unordered_set saw; + for (const auto& i : m_deque) { + auto it = saw.find(i); + if (it == saw.end()) { + saw.insert(it, i); + out.push_back(i); + } + } + return out; + } + VlQueue unique_index() const { + VlQueue out; + IData index = 0; + std::unordered_set saw; + for (const auto& i : m_deque) { + auto it = saw.find(i); + if (it == saw.end()) { + saw.insert(it, i); + out.push_back(index); + } + ++index; + } + return out; + } + template VlQueue find(Func with_func) const { + VlQueue out; + IData index = 0; + for (const auto& i : m_deque) { + if (with_func(index, i)) out.push_back(i); + ++index; + } + return out; + } + template VlQueue find_index(Func with_func) const { + VlQueue out; + IData index = 0; + for (const auto& i : m_deque) { + if (with_func(index, i)) out.push_back(index); + ++index; + } + return out; + } + template VlQueue find_first(Func with_func) const { + // Can't use std::find_if as need index number + IData index = 0; + for (const auto& i : m_deque) { + if (with_func(index, i)) return VlQueue::cons(i); + ++index; + } + return VlQueue{}; + } + template VlQueue find_first_index(Func with_func) const { + IData index = 0; + for (const auto& i : m_deque) { + if (with_func(index, i)) return VlQueue::cons(index); + ++index; + } + return VlQueue{}; + } + template VlQueue find_last(Func with_func) const { + IData index = m_deque.size() - 1; + for (auto it = m_deque.rbegin(); it != m_deque.rend(); ++it) { + if (with_func(index, *it)) return VlQueue::cons(*it); + --index; + } + return VlQueue{}; + } + template VlQueue find_last_index(Func with_func) const { + IData 
index = m_deque.size() - 1; + for (auto it = m_deque.rbegin(); it != m_deque.rend(); ++it) { + if (with_func(index, *it)) return VlQueue::cons(index); + --index; + } + return VlQueue{}; + } + + // Reduction operators + VlQueue min() const { + if (m_deque.empty()) return VlQueue{}; + const auto it = std::min_element(m_deque.begin(), m_deque.end()); + return VlQueue::cons(*it); + } + VlQueue max() const { + if (m_deque.empty()) return VlQueue{}; + const auto it = std::max_element(m_deque.begin(), m_deque.end()); + return VlQueue::cons(*it); + } + + T_Value r_sum() const { + T_Value out(0); // Type must have assignment operator + for (const auto& i : m_deque) out += i; + return out; + } + template T_Value r_sum(Func with_func) const { + T_Value out(0); // Type must have assignment operator + IData index = 0; + for (const auto& i : m_deque) out += with_func(index++, i); + return out; + } + T_Value r_product() const { + if (m_deque.empty()) return T_Value(0); + auto it = m_deque.begin(); + T_Value out{*it}; + ++it; + for (; it != m_deque.end(); ++it) out *= *it; + return out; + } + template T_Value r_product(Func with_func) const { + if (m_deque.empty()) return T_Value(0); + auto it = m_deque.begin(); + IData index = 0; + T_Value out{with_func(index, *it)}; + ++it; + ++index; + for (; it != m_deque.end(); ++it) out *= with_func(index++, *it); + return out; + } + T_Value r_and() const { + if (m_deque.empty()) return T_Value(0); + auto it = m_deque.begin(); + T_Value out{*it}; + ++it; + for (; it != m_deque.end(); ++it) out &= *it; + return out; + } + template T_Value r_and(Func with_func) const { + if (m_deque.empty()) return T_Value(0); + auto it = m_deque.begin(); + IData index = 0; + T_Value out{with_func(index, *it)}; + ++it; + ++index; + for (; it != m_deque.end(); ++it) out &= with_func(index, *it); + return out; + } + T_Value r_or() const { + T_Value out(0); // Type must have assignment operator + for (const auto& i : m_deque) out |= i; + return out; + } + template T_Value r_or(Func with_func) const { + T_Value out(0); // Type must have assignment operator + IData index = 0; + for (const auto& i : m_deque) out |= with_func(index++, i); + return out; + } + T_Value r_xor() const { + T_Value out(0); // Type must have assignment operator + for (const auto& i : m_deque) out ^= i; + return out; + } + template T_Value r_xor(Func with_func) const { + T_Value out(0); // Type must have assignment operator + IData index = 0; + for (const auto& i : m_deque) out ^= with_func(index++, i); + return out; + } + + // Dumping. Verilog: str = $sformatf("%p", assoc) + std::string to_string() const { + if (m_deque.empty()) return "'{}"; // No trailing space + std::string out = "'{"; + std::string comma; + for (const auto& i : m_deque) { + out += comma + VL_TO_STRING(i); + comma = ", "; + } + return out + "} "; + } +}; + +template std::string VL_TO_STRING(const VlQueue& obj) { + return obj.to_string(); +} + +//=================================================================== +// Verilog associative array container +// There are no multithreaded locks on this; the base variable must +// be protected by other means +// +template class VlAssocArray final { +private: + // TYPES + using Map = std::map; + +public: + using const_iterator = typename Map::const_iterator; + +private: + // MEMBERS + Map m_map; // State of the assoc array + T_Value m_defaultValue; // Default value + +public: + // CONSTRUCTORS + // m_defaultValue isn't defaulted. Caller's constructor must do it. 
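// Illustrative sketch, not part of the patch: driving VlAssocArray from
// standalone test-bench code, roughly mirroring a SystemVerilog "int aa[string]"
// with a foreach traversal. Keys and values are made up; note the caller
// establishes the default value, per the comment above.
#include "verilated.h"

static void exampleAssoc() {
    VlAssocArray<std::string, IData> aa;
    aa.atDefault() = 0;                  // caller-provided default, as noted above
    aa.at("apples") = 3;                 // aa["apples"] = 3;
    aa.set("pears", 5).set("plums", 7);  // chained setter form
    if (aa.exists("pears")) {            // aa.exists("pears")
        std::string k;
        for (int ok = aa.first(k); ok; ok = aa.next(k)) {  // foreach (aa[k]) ...
            const IData v = aa.at(k);    // value stored under key k
            (void)v;
        }
    }
}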
+ VlAssocArray() = default; + ~VlAssocArray() = default; + VlAssocArray(const VlAssocArray&) = default; + VlAssocArray(VlAssocArray&&) = default; + VlAssocArray& operator=(const VlAssocArray&) = default; + VlAssocArray& operator=(VlAssocArray&&) = default; + + // METHODS + T_Value& atDefault() { return m_defaultValue; } + const T_Value& atDefault() const { return m_defaultValue; } + + // Size of array. Verilog: function int size(), or int num() + int size() const { return m_map.size(); } + // Clear array. Verilog: function void delete([input index]) + void clear() { m_map.clear(); } + void erase(const T_Key& index) { m_map.erase(index); } + // Return 0/1 if element exists. Verilog: function int exists(input index) + int exists(const T_Key& index) const { return m_map.find(index) != m_map.end(); } + // Return first element. Verilog: function int first(ref index); + int first(T_Key& indexr) const { + const auto it = m_map.cbegin(); + if (it == m_map.end()) return 0; + indexr = it->first; + return 1; + } + // Return last element. Verilog: function int last(ref index) + int last(T_Key& indexr) const { + const auto it = m_map.crbegin(); + if (it == m_map.rend()) return 0; + indexr = it->first; + return 1; + } + // Return next element. Verilog: function int next(ref index) + int next(T_Key& indexr) const { + auto it = m_map.find(indexr); + if (VL_UNLIKELY(it == m_map.end())) return 0; + ++it; + if (VL_UNLIKELY(it == m_map.end())) return 0; + indexr = it->first; + return 1; + } + // Return prev element. Verilog: function int prev(ref index) + int prev(T_Key& indexr) const { + auto it = m_map.find(indexr); + if (VL_UNLIKELY(it == m_map.end())) return 0; + if (VL_UNLIKELY(it == m_map.begin())) return 0; + --it; + indexr = it->first; + return 1; + } + // Setting. Verilog: assoc[index] = v + // Can't just overload operator[] or provide a "at" reference to set, + // because we need to be able to insert only when the value is set + T_Value& at(const T_Key& index) { + const auto it = m_map.find(index); + if (it == m_map.end()) { + std::pair pit = m_map.emplace(index, m_defaultValue); + return pit.first->second; + } + return it->second; + } + // Accessing. 
Verilog: v = assoc[index] + const T_Value& at(const T_Key& index) const { + const auto it = m_map.find(index); + if (it == m_map.end()) { + return m_defaultValue; + } else { + return it->second; + } + } + // Setting as a chained operation + VlAssocArray& set(const T_Key& index, const T_Value& value) { + at(index) = value; + return *this; + } + VlAssocArray& setDefault(const T_Value& value) { + atDefault() = value; + return *this; + } + + // For save/restore + const_iterator begin() const { return m_map.begin(); } + const_iterator end() const { return m_map.end(); } + + // Methods + VlQueue unique() const { + VlQueue out; + std::set saw; + for (const auto& i : m_map) { + auto it = saw.find(i.second); + if (it == saw.end()) { + saw.insert(it, i.second); + out.push_back(i.second); + } + } + return out; + } + VlQueue unique_index() const { + VlQueue out; + std::set saw; + for (const auto& i : m_map) { + auto it = saw.find(i.second); + if (it == saw.end()) { + saw.insert(it, i.second); + out.push_back(i.first); + } + } + return out; + } + template VlQueue find(Func with_func) const { + VlQueue out; + for (const auto& i : m_map) + if (with_func(i.first, i.second)) out.push_back(i.second); + return out; + } + template VlQueue find_index(Func with_func) const { + VlQueue out; + for (const auto& i : m_map) + if (with_func(i.first, i.second)) out.push_back(i.first); + return out; + } + template VlQueue find_first(Func with_func) const { + const auto it + = std::find_if(m_map.begin(), m_map.end(), [=](const std::pair& i) { + return with_func(i.first, i.second); + }); + if (it == m_map.end()) return VlQueue{}; + return VlQueue::cons(it->second); + } + template VlQueue find_first_index(Func with_func) const { + const auto it + = std::find_if(m_map.begin(), m_map.end(), [=](const std::pair& i) { + return with_func(i.first, i.second); + }); + if (it == m_map.end()) return VlQueue{}; + return VlQueue::cons(it->first); + } + template VlQueue find_last(Func with_func) const { + const auto it + = std::find_if(m_map.rbegin(), m_map.rend(), [=](const std::pair& i) { + return with_func(i.first, i.second); + }); + if (it == m_map.rend()) return VlQueue{}; + return VlQueue::cons(it->second); + } + template VlQueue find_last_index(Func with_func) const { + const auto it + = std::find_if(m_map.rbegin(), m_map.rend(), [=](const std::pair& i) { + return with_func(i.first, i.second); + }); + if (it == m_map.rend()) return VlQueue{}; + return VlQueue::cons(it->first); + } + + // Reduction operators + VlQueue min() const { + if (m_map.empty()) return VlQueue(); + const auto it = std::min_element( + m_map.begin(), m_map.end(), + [](const std::pair& a, const std::pair& b) { + return a.second < b.second; + }); + return VlQueue::cons(it->second); + } + VlQueue max() const { + if (m_map.empty()) return VlQueue(); + const auto it = std::max_element( + m_map.begin(), m_map.end(), + [](const std::pair& a, const std::pair& b) { + return a.second < b.second; + }); + return VlQueue::cons(it->second); + } + + T_Value r_sum() const { + T_Value out(0); // Type must have assignment operator + for (const auto& i : m_map) out += i.second; + return out; + } + template T_Value r_sum(Func with_func) const { + T_Value out(0); // Type must have assignment operator + for (const auto& i : m_map) out += with_func(i.first, i.second); + return out; + } + T_Value r_product() const { + if (m_map.empty()) return T_Value(0); + auto it = m_map.begin(); + T_Value out{it->second}; + ++it; + for (; it != m_map.end(); ++it) out *= it->second; + return 
out; + } + template T_Value r_product(Func with_func) const { + if (m_map.empty()) return T_Value(0); + auto it = m_map.begin(); + T_Value out{with_func(it->first, it->second)}; + ++it; + for (; it != m_map.end(); ++it) out *= with_func(it->first, it->second); + return out; + } + T_Value r_and() const { + if (m_map.empty()) return T_Value(0); + auto it = m_map.begin(); + T_Value out{it->second}; + ++it; + for (; it != m_map.end(); ++it) out &= it->second; + return out; + } + template T_Value r_and(Func with_func) const { + if (m_map.empty()) return T_Value(0); + auto it = m_map.begin(); + T_Value out{with_func(it->first, it->second)}; + ++it; + for (; it != m_map.end(); ++it) out &= with_func(it->first, it->second); + return out; + } + T_Value r_or() const { + T_Value out(0); // Type must have assignment operator + for (const auto& i : m_map) out |= i.second; + return out; + } + template T_Value r_or(Func with_func) const { + T_Value out(0); // Type must have assignment operator + for (const auto& i : m_map) out |= with_func(i.first, i.second); + return out; + } + T_Value r_xor() const { + T_Value out(0); // Type must have assignment operator + for (const auto& i : m_map) out ^= i.second; + return out; + } + template T_Value r_xor(Func with_func) const { + T_Value out(0); // Type must have assignment operator + for (const auto& i : m_map) out ^= with_func(i.first, i.second); + return out; + } + + // Dumping. Verilog: str = $sformatf("%p", assoc) + std::string to_string() const { + if (m_map.empty()) return "'{}"; // No trailing space + std::string out = "'{"; + std::string comma; + for (const auto& i : m_map) { + out += comma + VL_TO_STRING(i.first) + ":" + VL_TO_STRING(i.second); + comma = ", "; + } + // Default not printed - maybe random init data + return out + "} "; + } +}; + +template +std::string VL_TO_STRING(const VlAssocArray& obj) { + return obj.to_string(); +} + +template +void VL_READMEM_N(bool hex, int bits, const std::string& filename, + VlAssocArray& obj, QData start, QData end) VL_MT_SAFE { + VlReadMem rmem{hex, bits, filename, start, end}; + if (VL_UNLIKELY(!rmem.isOpen())) return; + while (true) { + QData addr; + std::string data; + if (rmem.get(addr /*ref*/, data /*ref*/)) { + rmem.setData(&(obj.at(addr)), data); + } else { + break; + } + } +} + +template +void VL_WRITEMEM_N(bool hex, int bits, const std::string& filename, + const VlAssocArray& obj, QData start, QData end) VL_MT_SAFE { + VlWriteMem wmem{hex, bits, filename, start, end}; + if (VL_UNLIKELY(!wmem.isOpen())) return; + for (const auto& i : obj) { + const QData addr = i.first; + if (addr >= start && addr <= end) wmem.print(addr, true, &(i.second)); + } +} + +//=================================================================== +/// Verilog unpacked array container +/// For when a standard C++[] array is not sufficient, e.g. an +/// array under a queue, or methods operating on the array. +/// +/// A 'struct' as we want this to be an aggregate type that allows +/// static aggregate initialization. Consider data members private. +/// +/// This class may get exposed to a Verilated Model's top I/O, if the top +/// IO has an unpacked array. + +template struct VlUnpacked final { + // MEMBERS + // This should be the only data member, otherwise generated static initializers need updating + T_Value m_storage[T_Depth]; // Contents of the unpacked array + + // CONSTRUCTORS + // Default constructors and destructor are used. 
+
+//===================================================================
+/// Verilog unpacked array container
+/// For when a standard C++[] array is not sufficient, e.g. an
+/// array under a queue, or methods operating on the array.
+///
+/// A 'struct' as we want this to be an aggregate type that allows
+/// static aggregate initialization. Consider data members private.
+///
+/// This class may get exposed to a Verilated Model's top I/O, if the top
+/// IO has an unpacked array.
+
+template <class T_Value, std::size_t T_Depth> struct VlUnpacked final {
+    // MEMBERS
+    // This should be the only data member, otherwise generated static initializers need updating
+    T_Value m_storage[T_Depth];  // Contents of the unpacked array
+
+    // CONSTRUCTORS
+    // Default constructors and destructor are used. Note however that C++20 requires that
+    // aggregate types do not have a user declared constructor, not even an explicitly defaulted
+    // one.
+
+    // OPERATOR METHODS
+    // Default copy assignment operators are used.
+
+    // METHODS
+    // Raw access
+    WData* data() { return &m_storage[0]; }
+    const WData* data() const { return &m_storage[0]; }
+
+    T_Value& operator[](size_t index) { return m_storage[index]; };
+    const T_Value& operator[](size_t index) const { return m_storage[index]; };
+
+    // Dumping. Verilog: str = $sformatf("%p", assoc)
+    std::string to_string() const {
+        std::string out = "'{";
+        std::string comma;
+        for (int i = 0; i < T_Depth; ++i) {
+            out += comma + VL_TO_STRING(m_storage[i]);
+            comma = ", ";
+        }
+        return out + "} ";
+    }
+};
+
+template <class T_Value, std::size_t T_Depth>
+std::string VL_TO_STRING(const VlUnpacked<T_Value, T_Depth>& obj) {
+    return obj.to_string();
+}
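Because the comment above stresses that VlUnpacked must remain an aggregate, here is a small sketch of the static aggregate initialization this preserves (values and names are illustrative, not from the patch):

    #include "verilated.h"

    // Verilog: int unpacked_array [0:3] = '{10, 20, 30, 40};
    static VlUnpacked<IData, 4> s_unpacked = {{10, 20, 30, 40}};  // Constant-initialized, no runtime ctor

    void unpacked_example() {
        s_unpacked[2] = 33;  // operator[] gives raw element access
        VL_PRINTF("%s\n", s_unpacked.to_string().c_str());  // Prints roughly '{10, 20, 33, 40}
    }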
\"verilated_threads.h\"\n"); if (v3Global.opt.savable()) puts("#include \"verilated_save.h\"\n"); if (v3Global.opt.coverage()) puts("#include \"verilated_cov.h\"\n"); diff --git a/src/V3EmitCSyms.cpp b/src/V3EmitCSyms.cpp index 7cd836c91..f86dd7487 100644 --- a/src/V3EmitCSyms.cpp +++ b/src/V3EmitCSyms.cpp @@ -389,7 +389,7 @@ void EmitCSyms::emitSymHdr() { puts("\n"); ofp()->putsIntTopInclude(); - puts("#include \"verilated_heavy.h\"\n"); + puts("#include \"verilated.h\"\n"); if (v3Global.needTraceDumper()) { puts("#include \"" + v3Global.opt.traceSourceLang() + ".h\"\n"); } diff --git a/test_regress/t/t_verilated_all.pl b/test_regress/t/t_verilated_all.pl index 1d8370346..44b2d3a72 100755 --- a/test_regress/t/t_verilated_all.pl +++ b/test_regress/t/t_verilated_all.pl @@ -53,6 +53,7 @@ foreach my $file (sort keys %hit) { if (!$hit{$file} && $file !~ /_sc/ && $file !~ /_fst/ + && $file !~ /_heavy/ && ($file !~ /_thread/ || $Self->cfg_with_threaded)) { error("Include file not covered by t_verilated_all test: ",$file); }