diff --git a/include/verilated_fst_c.cpp b/include/verilated_fst_c.cpp
index c4b1dc077..c54d818d8 100644
--- a/include/verilated_fst_c.cpp
+++ b/include/verilated_fst_c.cpp
@@ -64,11 +64,13 @@
 
 VerilatedFst::VerilatedFst(void* fst)
     : m_fst(fst)
-    , m_symbolp(NULL) {}
+    , m_symbolp(NULL)
+    , m_strbuf(NULL) {}
 
 VerilatedFst::~VerilatedFst() {
     if (m_fst) fstWriterClose(m_fst);
     if (m_symbolp) VL_DO_CLEAR(delete[] m_symbolp, m_symbolp = NULL);
+    if (m_strbuf) VL_DO_CLEAR(delete[] m_strbuf, m_strbuf = NULL);
 }
 
 void VerilatedFst::open(const char* filename) VL_MT_UNSAFE {
@@ -100,6 +102,9 @@ void VerilatedFst::open(const char* filename) VL_MT_UNSAFE {
         }
     }
     m_code2symbol.clear();
+
+    // Allocate string buffer for arrays
+    if (!m_strbuf) { m_strbuf = new char[maxBits() + 32]; }
 }
 
 void VerilatedFst::close() {
@@ -213,25 +218,59 @@ void VerilatedFst::declDouble(vluint32_t code, const char* name, int dtypenum, f
 // so always inline them.
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitBit(vluint32_t code, vluint32_t newval) {
+void VerilatedFst::emitBit(vluint32_t code, CData newval) {
     fstWriterEmitValueChange(m_fst, m_symbolp[code], newval ? "1" : "0");
 }
+
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitBus(vluint32_t code, vluint32_t newval, int bits) {
-    fstWriterEmitValueChange32(m_fst, m_symbolp[code], bits, newval);
+void VerilatedFst::emitCData(vluint32_t code, CData newval, int bits) {
+    char buf[VL_BYTESIZE];
+    cvtCDataToStr(buf, newval << (VL_BYTESIZE - bits));
+    fstWriterEmitValueChange(m_fst, m_symbolp[code], buf);
 }
+
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitQuad(vluint32_t code, vluint64_t newval, int bits) {
-    fstWriterEmitValueChange64(m_fst, m_symbolp[code], bits, newval);
+void VerilatedFst::emitSData(vluint32_t code, SData newval, int bits) {
+    char buf[VL_SHORTSIZE];
+    cvtSDataToStr(buf, newval << (VL_SHORTSIZE - bits));
+    fstWriterEmitValueChange(m_fst, m_symbolp[code], buf);
 }
+
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
-    fstWriterEmitValueChangeVec32(m_fst, m_symbolp[code], bits, newvalp);
+void VerilatedFst::emitIData(vluint32_t code, IData newval, int bits) {
+    char buf[VL_IDATASIZE];
+    cvtIDataToStr(buf, newval << (VL_IDATASIZE - bits));
+    fstWriterEmitValueChange(m_fst, m_symbolp[code], buf);
 }
+
+VL_ATTR_ALWINLINE
+void VerilatedFst::emitQData(vluint32_t code, QData newval, int bits) {
+    char buf[VL_QUADSIZE];
+    cvtQDataToStr(buf, newval << (VL_QUADSIZE - bits));
+    fstWriterEmitValueChange(m_fst, m_symbolp[code], buf);
+}
+
+VL_ATTR_ALWINLINE
+void VerilatedFst::emitWData(vluint32_t code, const WData* newvalp, int bits) {
+    int words = VL_WORDS_I(bits);
+    char* wp = m_strbuf;
+    // Convert the most significant word
+    const int bitsInMSW = VL_BITBIT_E(bits) ? VL_BITBIT_E(bits) : VL_EDATASIZE;
+    cvtEDataToStr(wp, newvalp[--words] << (VL_EDATASIZE - bitsInMSW));
+    wp += bitsInMSW;
+    // Convert the remaining words
+    while (words > 0) {
+        cvtEDataToStr(wp, newvalp[--words]);
+        wp += VL_EDATASIZE;
+    }
+    fstWriterEmitValueChange(m_fst, m_symbolp[code], m_strbuf);
+}
+
 VL_ATTR_ALWINLINE
 void VerilatedFst::emitFloat(vluint32_t code, float newval) {
     fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval);
 }
+
 VL_ATTR_ALWINLINE
 void VerilatedFst::emitDouble(vluint32_t code, double newval) {
     fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval);
diff --git a/include/verilated_fst_c.h b/include/verilated_fst_c.h
index b80d8ea7a..c2b0605c7 100644
--- a/include/verilated_fst_c.h
+++ b/include/verilated_fst_c.h
@@ -51,6 +51,8 @@ private:
     Local2FstDtype m_local2fstdtype;
     std::list<std::string> m_curScope;
     fstHandle* m_symbolp;  ///< same as m_code2symbol, but as an array
+    char* m_strbuf;  ///< String buffer long enough to hold maxBits() chars
+
     // CONSTRUCTORS
     VL_UNCOPYABLE(VerilatedFst);
     void declSymbol(vluint32_t code, const char* name, int dtypenum, fstVarDir vardir,
@@ -69,10 +71,12 @@ protected:
 
     // Implementations of duck-typed methods for VerilatedTrace. These are
     // called from only one place (namely full*) so always inline them.
-    inline void emitBit(vluint32_t code, vluint32_t newval);
-    inline void emitBus(vluint32_t code, vluint32_t newval, int bits);
-    inline void emitQuad(vluint32_t code, vluint64_t newval, int bits);
-    inline void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits);
+    inline void emitBit(vluint32_t code, CData newval);
+    inline void emitCData(vluint32_t code, CData newval, int bits);
+    inline void emitSData(vluint32_t code, SData newval, int bits);
+    inline void emitIData(vluint32_t code, IData newval, int bits);
+    inline void emitQData(vluint32_t code, QData newval, int bits);
+    inline void emitWData(vluint32_t code, const WData* newvalp, int bits);
     inline void emitFloat(vluint32_t code, float newval);
     inline void emitDouble(vluint32_t code, double newval);
 
diff --git a/include/verilated_intrinsics.h b/include/verilated_intrinsics.h
new file mode 100644
index 000000000..11b532fd7
--- /dev/null
+++ b/include/verilated_intrinsics.h
@@ -0,0 +1,41 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//*************************************************************************
+//
+// Copyright 2003-2020 by Wilson Snyder. This program is free software; you can
+// redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+//
+//*************************************************************************
+///
+/// \file
+/// \brief Verilator: Common include for target specific intrinsics.
+///
+///     Code using machine specific intrinsics for optimization should
+///     include this header rather than directly including he target
+///     specific headers. We provide macros to check for availability
+///     of instruction sets, and a common mechanism to disable them.
+///
+//*************************************************************************
+
+#ifndef _VERILATED_INTRINSICS_H_
+#define _VERILATED_INTRINSICS_H_ 1  ///< Header Guard
+
+// clang-format off
+
+// Use VL_DISABLE_INTRINSICS to disable all intrinsics based optimization
+#if !defined(VL_DISABLE_INTRINSICS) && !defined(VL_PORTABLE_ONLY)
+# if defined(__SSE2__) && !defined(VL_DISABLE_SSE2)
+#  define VL_HAVE_SSE2 1
+#  include <emmintrin.h>
+# endif
+# if defined(__AVX2__) && defined(VL_HAVE_SSE2) && !defined(VL_DISABLE_AVX2)
+#  define VL_HAVE_AVX2 1
+#  include <immintrin.h>
+# endif
+#endif
+
+// clang-format on
+
+#endif  // Guard
diff --git a/include/verilated_trace.h b/include/verilated_trace.h
index bb0bce25a..471db6b99 100644
--- a/include/verilated_trace.h
+++ b/include/verilated_trace.h
@@ -90,11 +90,13 @@ public:
     enum {
         CHG_BIT_0 = 0x0,
         CHG_BIT_1 = 0x1,
-        CHG_BUS = 0x2,
-        CHG_QUAD = 0x3,
-        CHG_ARRAY = 0x4,
-        CHG_FLOAT = 0x5,
-        CHG_DOUBLE = 0x6,
+        CHG_CDATA = 0x2,
+        CHG_SDATA = 0x3,
+        CHG_IDATA = 0x4,
+        CHG_QDATA = 0x5,
+        CHG_WDATA = 0x6,
+        CHG_FLOAT = 0x7,
+        CHG_DOUBLE = 0x8,
         // TODO: full..
         TIME_CHANGE = 0xd,
         END = 0xe,  // End of buffer
@@ -122,6 +124,7 @@ private:
     bool m_fullDump;  ///< Whether a full dump is required on the next call to 'dump'
     vluint32_t m_nextCode;  ///< Next code number to assign
     vluint32_t m_numSignals;  ///< Number of distinct signals
+    vluint32_t m_maxBits;  ///< Number of bits in the widest signal
     std::string m_moduleName;  ///< Name of module being trace initialized now
     char m_scopeEscape;
     double m_timeRes;  ///< Time resolution (ns/ms etc)
@@ -176,6 +179,7 @@ protected:
 
     vluint32_t nextCode() const { return m_nextCode; }
     vluint32_t numSignals() const { return m_numSignals; }
+    vluint32_t maxBits() const { return m_maxBits; }
     const std::string& moduleName() const { return m_moduleName; }
     void fullDump(bool value) { m_fullDump = value; }
     vluint64_t timeLastDump() { return m_timeLastDump; }
@@ -251,47 +255,65 @@ public:
     // these here, but we cannot afford dynamic dispatch for calling these as
     // this is very hot code during tracing.
 
-    // duck-typed void emitBit(vluint32_t code, vluint32_t newval) = 0;
-    // duck-typed void emitBus(vluint32_t code, vluint32_t newval, int bits) = 0;
-    // duck-typed void emitQuad(vluint32_t code, vluint64_t newval, int bits) = 0;
-    // duck-typed void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) = 0;
+    // duck-typed void emitBit(vluint32_t code, CData newval) = 0;
+    // duck-typed void emitCData(vluint32_t code, CData newval, int bits) = 0;
+    // duck-typed void emitSData(vluint32_t code, SData newval, int bits) = 0;
+    // duck-typed void emitIData(vluint32_t code, IData newval, int bits) = 0;
+    // duck-typed void emitQData(vluint32_t code, QData newval, int bits) = 0;
+    // duck-typed void emitWData(vluint32_t code, const WData* newvalp, int bits) = 0;
     // duck-typed void emitFloat(vluint32_t code, float newval) = 0;
     // duck-typed void emitDouble(vluint32_t code, double newval) = 0;
 
     vluint32_t* oldp(vluint32_t code) { return m_sigs_oldvalp + code; }
 
     // Write to previous value buffer value and emit trace entry.
-    void fullBit(vluint32_t* oldp, vluint32_t newval);
-    void fullBus(vluint32_t* oldp, vluint32_t newval, int bits);
-    void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits);
-    void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits);
+    void fullBit(vluint32_t* oldp, CData newval);
+    void fullCData(vluint32_t* oldp, CData newval, int bits);
+    void fullSData(vluint32_t* oldp, SData newval, int bits);
+    void fullIData(vluint32_t* oldp, IData newval, int bits);
+    void fullQData(vluint32_t* oldp, QData newval, int bits);
+    void fullWData(vluint32_t* oldp, const WData* newvalp, int bits);
     void fullFloat(vluint32_t* oldp, float newval);
     void fullDouble(vluint32_t* oldp, double newval);
 
 #ifdef VL_TRACE_THREADED
     // Threaded tracing. Just dump everything in the trace buffer
-    inline void chgBit(vluint32_t code, vluint32_t newval) {
+    inline void chgBit(vluint32_t code, CData newval) {
         m_traceBufferWritep[0] = VerilatedTraceCommand::CHG_BIT_0 | newval;
         m_traceBufferWritep[1] = code;
         m_traceBufferWritep += 2;
         VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
     }
-    inline void chgBus(vluint32_t code, vluint32_t newval, int bits) {
-        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_BUS;
+    inline void chgCData(vluint32_t code, CData newval, int bits) {
+        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_CDATA;
         m_traceBufferWritep[1] = code;
         m_traceBufferWritep[2] = newval;
         m_traceBufferWritep += 3;
         VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
     }
-    inline void chgQuad(vluint32_t code, vluint64_t newval, int bits) {
-        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_QUAD;
+    inline void chgSData(vluint32_t code, SData newval, int bits) {
+        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_SDATA;
         m_traceBufferWritep[1] = code;
-        *reinterpret_cast<vluint64_t*>(m_traceBufferWritep + 2) = newval;
+        m_traceBufferWritep[2] = newval;
+        m_traceBufferWritep += 3;
+        VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
+    }
+    inline void chgIData(vluint32_t code, IData newval, int bits) {
+        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_IDATA;
+        m_traceBufferWritep[1] = code;
+        m_traceBufferWritep[2] = newval;
+        m_traceBufferWritep += 3;
+        VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
+    }
+    inline void chgQData(vluint32_t code, QData newval, int bits) {
+        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_QDATA;
+        m_traceBufferWritep[1] = code;
+        *reinterpret_cast<QData*>(m_traceBufferWritep + 2) = newval;
         m_traceBufferWritep += 4;
         VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp););
     }
-    inline void chgArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
-        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_ARRAY;
+    inline void chgWData(vluint32_t code, const WData* newvalp, int bits) {
+        m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_WDATA;
         m_traceBufferWritep[1] = code;
         m_traceBufferWritep += 2;
         for (int i = 0; i < (bits + 31) / 32; ++i) { *m_traceBufferWritep++ = newvalp[i]; }
@@ -324,22 +346,30 @@ public:
     // thread and are called chg*Impl
 
     // Check previous dumped value of signal. If changed, then emit trace entry
-    inline void CHG(Bit)(vluint32_t* oldp, vluint32_t newval) {
+    inline void CHG(Bit)(vluint32_t* oldp, CData newval) {
         const vluint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullBit(oldp, newval);
     }
-    inline void CHG(Bus)(vluint32_t* oldp, vluint32_t newval, int bits) {
+    inline void CHG(CData)(vluint32_t* oldp, CData newval, int bits) {
         const vluint32_t diff = *oldp ^ newval;
-        if (VL_UNLIKELY(diff)) fullBus(oldp, newval, bits);
+        if (VL_UNLIKELY(diff)) fullCData(oldp, newval, bits);
     }
-    inline void CHG(Quad)(vluint32_t* oldp, vluint64_t newval, int bits) {
-        const vluint64_t diff = *reinterpret_cast<vluint64_t*>(oldp) ^ newval;
-        if (VL_UNLIKELY(diff)) fullQuad(oldp, newval, bits);
+    inline void CHG(SData)(vluint32_t* oldp, SData newval, int bits) {
+        const vluint32_t diff = *oldp ^ newval;
+        if (VL_UNLIKELY(diff)) fullSData(oldp, newval, bits);
     }
-    inline void CHG(Array)(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
+    inline void CHG(IData)(vluint32_t* oldp, IData newval, int bits) {
+        const vluint32_t diff = *oldp ^ newval;
+        if (VL_UNLIKELY(diff)) fullIData(oldp, newval, bits);
+    }
+    inline void CHG(QData)(vluint32_t* oldp, QData newval, int bits) {
+        const vluint64_t diff = *reinterpret_cast<QData*>(oldp) ^ newval;
+        if (VL_UNLIKELY(diff)) fullQData(oldp, newval, bits);
+    }
+    inline void CHG(WData)(vluint32_t* oldp, const WData* newvalp, int bits) {
         for (int i = 0; i < (bits + 31) / 32; ++i) {
             if (VL_UNLIKELY(oldp[i] ^ newvalp[i])) {
-                fullArray(oldp, newvalp, bits);
+                fullWData(oldp, newvalp, bits);
                 return;
             }
         }
diff --git a/include/verilated_trace_imp.cpp b/include/verilated_trace_imp.cpp
index 2e5345d65..78dc19a11 100644
--- a/include/verilated_trace_imp.cpp
+++ b/include/verilated_trace_imp.cpp
@@ -23,6 +23,7 @@
 # error "This file should be included in trace format implementations"
 #endif
 
+#include "verilated_intrinsics.h"
 #include "verilated_trace.h"
 
 #if 0
@@ -166,22 +167,34 @@ template <> void VerilatedTrace<VL_DERIVED_T>::workerThreadMain() {
                 VL_TRACE_THREAD_DEBUG("Command CHG_BIT_1 " << top);
                 chgBitImpl(oldp, 1);
                 continue;
-            case VerilatedTraceCommand::CHG_BUS:
-                VL_TRACE_THREAD_DEBUG("Command CHG_BUS " << top);
+            case VerilatedTraceCommand::CHG_CDATA:
+                VL_TRACE_THREAD_DEBUG("Command CHG_CDATA " << top);
                 // Bits stored in bottom byte of command
-                chgBusImpl(oldp, *readp, top);
+                chgCDataImpl(oldp, *readp, top);
                 readp += 1;
                 continue;
-            case VerilatedTraceCommand::CHG_QUAD:
-                VL_TRACE_THREAD_DEBUG("Command CHG_QUAD " << top);
+            case VerilatedTraceCommand::CHG_SDATA:
+                VL_TRACE_THREAD_DEBUG("Command CHG_SDATA " << top);
                 // Bits stored in bottom byte of command
-                chgQuadImpl(oldp, *reinterpret_cast<const vluint64_t*>(readp), top);
+                chgSDataImpl(oldp, *readp, top);
+                readp += 1;
+                continue;
+            case VerilatedTraceCommand::CHG_IDATA:
+                VL_TRACE_THREAD_DEBUG("Command CHG_IDATA " << top);
+                // Bits stored in bottom byte of command
+                chgIDataImpl(oldp, *readp, top);
+                readp += 1;
+                continue;
+            case VerilatedTraceCommand::CHG_QDATA:
+                VL_TRACE_THREAD_DEBUG("Command CHG_QDATA " << top);
+                // Bits stored in bottom byte of command
+                chgQDataImpl(oldp, *reinterpret_cast<const QData*>(readp), top);
                 readp += 2;
                 continue;
-            case VerilatedTraceCommand::CHG_ARRAY:
-                VL_TRACE_THREAD_DEBUG("Command CHG_ARRAY " << top);
-                chgArrayImpl(oldp, readp, top);
-                readp += (top + 31) / 32;
+            case VerilatedTraceCommand::CHG_WDATA:
+                VL_TRACE_THREAD_DEBUG("Command CHG_WDATA " << top);
+                chgWDataImpl(oldp, readp, top);
+                readp += VL_WORDS_I(top);
                 continue;
             case VerilatedTraceCommand::CHG_FLOAT:
                 VL_TRACE_THREAD_DEBUG("Command CHG_FLOAT " << top);
@@ -284,6 +297,7 @@ VerilatedTrace<VL_DERIVED_T>::VerilatedTrace()
     , m_fullDump(true)
     , m_nextCode(0)
     , m_numSignals(0)
+    , m_maxBits(0)
     , m_scopeEscape('.')
     , m_timeRes(1e-9)
     , m_timeUnit(1e-9)
@@ -318,6 +332,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
     const vluint32_t expectedCodes = nextCode();
     m_nextCode = 1;
     m_numSignals = 0;
+    m_maxBits = 0;
 
     // Call all initialize callbacks, which will call decl* for each signal.
     for (vluint32_t ent = 0; ent < m_callbacks.size(); ++ent) {
@@ -355,10 +370,11 @@ void VerilatedTrace<VL_DERIVED_T>::declCode(vluint32_t code, vluint32_t bits, bo
     }
     // Note: The tri-state flag is not used by Verilator, but is here for
     // compatibility with some foreign code.
-    int codesNeeded = (bits + 31) / 32;
+    int codesNeeded = VL_WORDS_I(bits);
     if (tri) codesNeeded *= 2;
     m_nextCode = std::max(m_nextCode, code + codesNeeded);
     ++m_numSignals;
+    m_maxBits = std::max(m_maxBits, bits);
 }
 
 //=========================================================================
@@ -486,35 +502,139 @@ void VerilatedTrace<VL_DERIVED_T>::addCallback(callback_t initcb, callback_t ful
 // that this file must be included in the format specific implementation, so
 // the emit* functions can be inlined for performance.
 
-template <> void VerilatedTrace<VL_DERIVED_T>::fullBit(vluint32_t* oldp, vluint32_t newval) {
+template <> void VerilatedTrace<VL_DERIVED_T>::fullBit(vluint32_t* oldp, CData newval) {
     *oldp = newval;
     self()->emitBit(oldp - m_sigs_oldvalp, newval);
 }
 
 template <>
-void VerilatedTrace<VL_DERIVED_T>::fullBus(vluint32_t* oldp, vluint32_t newval, int bits) {
+void VerilatedTrace<VL_DERIVED_T>::fullCData(vluint32_t* oldp, CData newval, int bits) {
     *oldp = newval;
-    self()->emitBus(oldp - m_sigs_oldvalp, newval, bits);
+    self()->emitCData(oldp - m_sigs_oldvalp, newval, bits);
 }
 
 template <>
-void VerilatedTrace<VL_DERIVED_T>::fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
-    *reinterpret_cast<vluint64_t*>(oldp) = newval;
-    self()->emitQuad(oldp - m_sigs_oldvalp, newval, bits);
+void VerilatedTrace<VL_DERIVED_T>::fullSData(vluint32_t* oldp, SData newval, int bits) {
+    *oldp = newval;
+    self()->emitSData(oldp - m_sigs_oldvalp, newval, bits);
 }
+
 template <>
-void VerilatedTrace<VL_DERIVED_T>::fullArray(vluint32_t* oldp, const vluint32_t* newvalp,
-                                             int bits) {
-    for (int i = 0; i < (bits + 31) / 32; ++i) oldp[i] = newvalp[i];
-    self()->emitArray(oldp - m_sigs_oldvalp, newvalp, bits);
+void VerilatedTrace<VL_DERIVED_T>::fullIData(vluint32_t* oldp, IData newval, int bits) {
+    *oldp = newval;
+    self()->emitIData(oldp - m_sigs_oldvalp, newval, bits);
 }
+
+template <>
+void VerilatedTrace<VL_DERIVED_T>::fullQData(vluint32_t* oldp, QData newval, int bits) {
+    *reinterpret_cast<QData*>(oldp) = newval;
+    self()->emitQData(oldp - m_sigs_oldvalp, newval, bits);
+}
+
+template <>
+void VerilatedTrace<VL_DERIVED_T>::fullWData(vluint32_t* oldp, const WData* newvalp, int bits) {
+    for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i];
+    self()->emitWData(oldp - m_sigs_oldvalp, newvalp, bits);
+}
+
 template <> void VerilatedTrace<VL_DERIVED_T>::fullFloat(vluint32_t* oldp, float newval) {
     // cppcheck-suppress invalidPointerCast
     *reinterpret_cast<float*>(oldp) = newval;
     self()->emitFloat(oldp - m_sigs_oldvalp, newval);
 }
+
 template <> void VerilatedTrace<VL_DERIVED_T>::fullDouble(vluint32_t* oldp, double newval) {
     // cppcheck-suppress invalidPointerCast
     *reinterpret_cast<double*>(oldp) = newval;
     self()->emitDouble(oldp - m_sigs_oldvalp, newval);
 }
+
+//=========================================================================
+// Primitives converting binary values to strings...
+
+// All of these take a destination pointer where the string will be emitted,
+// and a value to convert. There are a couple of variants for efficiency.
+
+inline static void cvtCDataToStr(char* dstp, CData value) {
+#ifdef VL_HAVE_SSE2
+    // Similar to cvtSDataToStr but only the bottom 8 byte lanes are used
+    const __m128i a = _mm_cvtsi32_si128(value);
+    const __m128i b = _mm_unpacklo_epi8(a, a);
+    const __m128i c = _mm_shufflelo_epi16(b, 0);
+    const __m128i m = _mm_set1_epi64x(0x0102040810204080);
+    const __m128i d = _mm_cmpeq_epi8(_mm_and_si128(c, m), m);
+    const __m128i result = _mm_sub_epi8(_mm_set1_epi8('0'), d);
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(dstp), result);
+#else
+    dstp[0] = '0' | static_cast<char>((value >> 7) & 1);
+    dstp[1] = '0' | static_cast<char>((value >> 6) & 1);
+    dstp[2] = '0' | static_cast<char>((value >> 5) & 1);
+    dstp[3] = '0' | static_cast<char>((value >> 4) & 1);
+    dstp[4] = '0' | static_cast<char>((value >> 3) & 1);
+    dstp[5] = '0' | static_cast<char>((value >> 2) & 1);
+    dstp[6] = '0' | static_cast<char>((value >> 1) & 1);
+    dstp[7] = '0' | static_cast<char>(value & 1);
+#endif
+}
+
+inline static void cvtSDataToStr(char* dstp, SData value) {
+#ifdef VL_HAVE_SSE2
+    // We want each bit in the 16-bit input value to end up in a byte lane
+    // within the 128-bit XMM register. Note that x86 is little-endian and we
+    // want the MSB of the input at the low address, so we will bit-reverse
+    // at the same time.
+
+    // Put value in bottom of 128-bit register a[15:0] = value
+    const __m128i a = _mm_cvtsi32_si128(value);
+    // Interleave bytes with themselves
+    // b[15: 0] = {2{a[ 7:0]}} == {2{value[ 7:0]}}
+    // b[31:16] = {2{a[15:8]}} == {2{value[15:8]}}
+    const __m128i b = _mm_unpacklo_epi8(a, a);
+    // Shuffle bottom 64 bits, note swapping high bytes with low bytes
+    // c[31: 0] = {2{b[31:16]}} == {4{value[15:8}}
+    // c[63:32] = {2{b[15: 0]}} == {4{value[ 7:0}}
+    const __m128i c = _mm_shufflelo_epi16(b, 0x05);
+    // Shuffle whole register
+    // d[ 63: 0] = {2{c[31: 0]}} == {8{value[15:8}}
+    // d[126:54] = {2{c[63:32]}} == {8{value[ 7:0}}
+    const __m128i d = _mm_shuffle_epi32(c, 0x50);
+    // Test each bit within the bytes, this sets each byte lane to 0
+    // if the bit for that lane is 0 and to 0xff if the bit is 1.
+    const __m128i m = _mm_set1_epi64x(0x0102040810204080);
+    const __m128i e = _mm_cmpeq_epi8(_mm_and_si128(d, m), m);
+    // Convert to ASCII by subtracting the masks from ASCII '0':
+    // '0' - 0 is '0', '0' - -1 is '1'
+    const __m128i result = _mm_sub_epi8(_mm_set1_epi8('0'), e);
+    // Store the 16 characters to the un-aligned buffer
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dstp), result);
+#else
+    cvtCDataToStr(dstp, value >> 8);
+    cvtCDataToStr(dstp + 8, value);
+#endif
+}
+
+inline static void cvtIDataToStr(char* dstp, IData value) {
+#ifdef VL_HAVE_AVX2
+    // Similar to cvtSDataToStr but the bottom 16-bits are processed in the
+    // top half of the YMM registerss
+    const __m256i a = _mm256_insert_epi32(_mm256_undefined_si256(), value, 0);
+    const __m256i b = _mm256_permute4x64_epi64(a, 0);
+    const __m256i s = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
+                                      2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
+    const __m256i c = _mm256_shuffle_epi8(b, s);
+    const __m256i m = _mm256_set1_epi64x(0x0102040810204080);
+    const __m256i d = _mm256_cmpeq_epi8(_mm256_and_si256(c, m), m);
+    const __m256i result = _mm256_sub_epi8(_mm256_set1_epi8('0'), d);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstp), result);
+#else
+    cvtSDataToStr(dstp, value >> 16);
+    cvtSDataToStr(dstp + 16, value);
+#endif
+}
+
+inline static void cvtQDataToStr(char* dstp, QData value) {
+    cvtIDataToStr(dstp, value >> 32);
+    cvtIDataToStr(dstp + 32, value);
+}
+
+#define cvtEDataToStr cvtIDataToStr
diff --git a/include/verilated_vcd_c.cpp b/include/verilated_vcd_c.cpp
index e1ffd2293..e940ee789 100644
--- a/include/verilated_vcd_c.cpp
+++ b/include/verilated_vcd_c.cpp
@@ -35,6 +35,8 @@
 # include <unistd.h>
 #endif
 
+#include "verilated_intrinsics.h"
+
 // SPDIFF_ON
 
 #ifndef O_LARGEFILE  // For example on WIN32
@@ -606,20 +608,14 @@ void VerilatedVcd::declTriArray(vluint32_t code, const char* name, bool array, i
 #endif  //  VL_TRACE_VCD_OLD_API
 
 //=============================================================================
-// Trace recording routines
+// Trace rendering prinitives
 
-//=============================================================================
-// Emit trace entries
-
-#define VL_VCD_SUFFIXP(code) (m_suffixesp + (code)*VL_TRACE_SUFFIX_ENTRY_SIZE)
-
-// Emit suffix, write back write pointer, check buffer
 void VerilatedVcd::finishLine(vluint32_t code, char* writep) {
-    const char* const suffixp = VL_VCD_SUFFIXP(code);
+    const char* const suffixp = m_suffixesp + code * VL_TRACE_SUFFIX_ENTRY_SIZE;
     // Copy the whole suffix (this avoid having hard to predict branches which
     // helps a lot). Note: The maximum length of the suffix is
     // VL_TRACE_MAX_VCD_CODE_SIZE + 2 == 7, but we unroll this here for speed.
-#ifdef __x86_64__
+#ifdef VL_X86_64
     // Copy the whole 8 bytes in one go, this works on little-endian machines
     // supporting unaligned stores.
     *reinterpret_cast<vluint64_t*>(writep) = *reinterpret_cast<const vluint64_t*>(suffixp);
@@ -639,12 +635,15 @@ void VerilatedVcd::finishLine(vluint32_t code, char* writep) {
     bufferCheck();
 }
 
+//=============================================================================
+// emit* trace routines
+
 // Note: emit* are only ever called from one place (full* in
 // verilated_trace_imp.cpp, which is included in this file at the top),
 // so always inline them.
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitBit(vluint32_t code, vluint32_t newval) {
+void VerilatedVcd::emitBit(vluint32_t code, CData newval) {
     // Don't prefetch suffix as it's a bit too late;
     char* wp = m_writep;
     *wp++ = '0' | static_cast<char>(newval);
@@ -652,182 +651,56 @@ void VerilatedVcd::emitBit(vluint32_t code, vluint32_t newval) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitBus(vluint32_t code, vluint32_t newval, int bits) {
-    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
+void VerilatedVcd::emitCData(vluint32_t code, CData newval, int bits) {
     char* wp = m_writep;
     *wp++ = 'b';
-    wp += bits;
-    // clang-format off
-    switch (bits) {
-    case 32: wp[-32] = '0' | static_cast<char>((newval >> 31)    ); //FALLTHRU
-    case 31: wp[-31] = '0' | static_cast<char>((newval >> 30) & 1); //FALLTHRU
-    case 30: wp[-30] = '0' | static_cast<char>((newval >> 29) & 1); //FALLTHRU
-    case 29: wp[-29] = '0' | static_cast<char>((newval >> 28) & 1); //FALLTHRU
-    case 28: wp[-28] = '0' | static_cast<char>((newval >> 27) & 1); //FALLTHRU
-    case 27: wp[-27] = '0' | static_cast<char>((newval >> 26) & 1); //FALLTHRU
-    case 26: wp[-26] = '0' | static_cast<char>((newval >> 25) & 1); //FALLTHRU
-    case 25: wp[-25] = '0' | static_cast<char>((newval >> 24) & 1); //FALLTHRU
-    case 24: wp[-24] = '0' | static_cast<char>((newval >> 23) & 1); //FALLTHRU
-    case 23: wp[-23] = '0' | static_cast<char>((newval >> 22) & 1); //FALLTHRU
-    case 22: wp[-22] = '0' | static_cast<char>((newval >> 21) & 1); //FALLTHRU
-    case 21: wp[-21] = '0' | static_cast<char>((newval >> 20) & 1); //FALLTHRU
-    case 20: wp[-20] = '0' | static_cast<char>((newval >> 19) & 1); //FALLTHRU
-    case 19: wp[-19] = '0' | static_cast<char>((newval >> 18) & 1); //FALLTHRU
-    case 18: wp[-18] = '0' | static_cast<char>((newval >> 17) & 1); //FALLTHRU
-    case 17: wp[-17] = '0' | static_cast<char>((newval >> 16) & 1); //FALLTHRU
-    case 16: wp[-16] = '0' | static_cast<char>((newval >> 15) & 1); //FALLTHRU
-    case 15: wp[-15] = '0' | static_cast<char>((newval >> 14) & 1); //FALLTHRU
-    case 14: wp[-14] = '0' | static_cast<char>((newval >> 13) & 1); //FALLTHRU
-    case 13: wp[-13] = '0' | static_cast<char>((newval >> 12) & 1); //FALLTHRU
-    case 12: wp[-12] = '0' | static_cast<char>((newval >> 11) & 1); //FALLTHRU
-    case 11: wp[-11] = '0' | static_cast<char>((newval >> 10) & 1); //FALLTHRU
-    case 10: wp[-10] = '0' | static_cast<char>((newval >>  9) & 1); //FALLTHRU
-    case 9:  wp[ -9] = '0' | static_cast<char>((newval >>  8) & 1); //FALLTHRU
-    case 8:  wp[ -8] = '0' | static_cast<char>((newval >>  7) & 1); //FALLTHRU
-    case 7:  wp[ -7] = '0' | static_cast<char>((newval >>  6) & 1); //FALLTHRU
-    case 6:  wp[ -6] = '0' | static_cast<char>((newval >>  5) & 1); //FALLTHRU
-    case 5:  wp[ -5] = '0' | static_cast<char>((newval >>  4) & 1); //FALLTHRU
-    case 4:  wp[ -4] = '0' | static_cast<char>((newval >>  3) & 1); //FALLTHRU
-    case 3:  wp[ -3] = '0' | static_cast<char>((newval >>  2) & 1); //FALLTHRU
-    case 2:  wp[ -2] = '0' | static_cast<char>((newval >>  1) & 1); //FALLTHRU
-    /*bit*/  wp[ -1] = '0' | static_cast<char>((newval      ) & 1); //FALLTHRU
-    }
-    // clang-format on
-    finishLine(code, wp);
+    cvtCDataToStr(wp, newval << (VL_BYTESIZE - bits));
+    finishLine(code, wp + bits);
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitQuad(vluint32_t code, vluint64_t newval, int bits) {
-    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
+void VerilatedVcd::emitSData(vluint32_t code, SData newval, int bits) {
     char* wp = m_writep;
     *wp++ = 'b';
-    // Handle the top 32 bits within the 64 bit input
-    const int bitsInTopHalf = bits - 32;
-    wp += bitsInTopHalf;
-    // clang-format off
-    switch (bitsInTopHalf) {
-    case 32: wp[-32] = '0' | static_cast<char>((newval >> 63)    ); //FALLTHRU
-    case 31: wp[-31] = '0' | static_cast<char>((newval >> 62) & 1); //FALLTHRU
-    case 30: wp[-30] = '0' | static_cast<char>((newval >> 61) & 1); //FALLTHRU
-    case 29: wp[-29] = '0' | static_cast<char>((newval >> 60) & 1); //FALLTHRU
-    case 28: wp[-28] = '0' | static_cast<char>((newval >> 59) & 1); //FALLTHRU
-    case 27: wp[-27] = '0' | static_cast<char>((newval >> 58) & 1); //FALLTHRU
-    case 26: wp[-26] = '0' | static_cast<char>((newval >> 57) & 1); //FALLTHRU
-    case 25: wp[-25] = '0' | static_cast<char>((newval >> 56) & 1); //FALLTHRU
-    case 24: wp[-24] = '0' | static_cast<char>((newval >> 55) & 1); //FALLTHRU
-    case 23: wp[-23] = '0' | static_cast<char>((newval >> 54) & 1); //FALLTHRU
-    case 22: wp[-22] = '0' | static_cast<char>((newval >> 53) & 1); //FALLTHRU
-    case 21: wp[-21] = '0' | static_cast<char>((newval >> 52) & 1); //FALLTHRU
-    case 20: wp[-20] = '0' | static_cast<char>((newval >> 51) & 1); //FALLTHRU
-    case 19: wp[-19] = '0' | static_cast<char>((newval >> 50) & 1); //FALLTHRU
-    case 18: wp[-18] = '0' | static_cast<char>((newval >> 49) & 1); //FALLTHRU
-    case 17: wp[-17] = '0' | static_cast<char>((newval >> 48) & 1); //FALLTHRU
-    case 16: wp[-16] = '0' | static_cast<char>((newval >> 47) & 1); //FALLTHRU
-    case 15: wp[-15] = '0' | static_cast<char>((newval >> 46) & 1); //FALLTHRU
-    case 14: wp[-14] = '0' | static_cast<char>((newval >> 45) & 1); //FALLTHRU
-    case 13: wp[-13] = '0' | static_cast<char>((newval >> 44) & 1); //FALLTHRU
-    case 12: wp[-12] = '0' | static_cast<char>((newval >> 43) & 1); //FALLTHRU
-    case 11: wp[-11] = '0' | static_cast<char>((newval >> 42) & 1); //FALLTHRU
-    case 10: wp[-10] = '0' | static_cast<char>((newval >> 41) & 1); //FALLTHRU
-    case 9:  wp[ -9] = '0' | static_cast<char>((newval >> 40) & 1); //FALLTHRU
-    case 8:  wp[ -8] = '0' | static_cast<char>((newval >> 39) & 1); //FALLTHRU
-    case 7:  wp[ -7] = '0' | static_cast<char>((newval >> 38) & 1); //FALLTHRU
-    case 6:  wp[ -6] = '0' | static_cast<char>((newval >> 37) & 1); //FALLTHRU
-    case 5:  wp[ -5] = '0' | static_cast<char>((newval >> 36) & 1); //FALLTHRU
-    case 4:  wp[ -4] = '0' | static_cast<char>((newval >> 35) & 1); //FALLTHRU
-    case 3:  wp[ -3] = '0' | static_cast<char>((newval >> 34) & 1); //FALLTHRU
-    case 2:  wp[ -2] = '0' | static_cast<char>((newval >> 33) & 1); //FALLTHRU
-    case 1:  wp[ -1] = '0' | static_cast<char>((newval >> 32) & 1); //FALLTHRU
-    }
-    // clang-format on
-    // Handle the bottom 32 bits within the 64 bit input
-    vluint32_t val = static_cast<vluint32_t>(newval);  // Truncate to bottom 32 bits
-    int loops = 4;
-    do {
-        wp[0] = '0' | static_cast<char>((val >> 31));
-        wp[1] = '0' | static_cast<char>((val >> 30) & 1);
-        wp[2] = '0' | static_cast<char>((val >> 29) & 1);
-        wp[3] = '0' | static_cast<char>((val >> 28) & 1);
-        wp[4] = '0' | static_cast<char>((val >> 27) & 1);
-        wp[5] = '0' | static_cast<char>((val >> 26) & 1);
-        wp[6] = '0' | static_cast<char>((val >> 25) & 1);
-        wp[7] = '0' | static_cast<char>((val >> 24) & 1);
-        wp += 8;
-        val <<= 8;
-    } while (--loops);
-
-    finishLine(code, wp);
+    cvtSDataToStr(wp, newval << (VL_SHORTSIZE - bits));
+    finishLine(code, wp + bits);
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) {
-    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
-    int words = (bits + 31) / 32;
+void VerilatedVcd::emitIData(vluint32_t code, IData newval, int bits) {
+    char* wp = m_writep;
+    *wp++ = 'b';
+    cvtIDataToStr(wp, newval << (VL_IDATASIZE - bits));
+    finishLine(code, wp + bits);
+}
+
+VL_ATTR_ALWINLINE
+void VerilatedVcd::emitQData(vluint32_t code, QData newval, int bits) {
+    char* wp = m_writep;
+    *wp++ = 'b';
+    cvtQDataToStr(wp, newval << (VL_QUADSIZE - bits));
+    finishLine(code, wp + bits);
+}
+
+VL_ATTR_ALWINLINE
+void VerilatedVcd::emitWData(vluint32_t code, const WData* newvalp, int bits) {
+    int words = VL_WORDS_I(bits);
     char* wp = m_writep;
     *wp++ = 'b';
     // Handle the most significant word
-    vluint32_t val = newvalp[--words];
-    const int bitsInMSW = bits % 32 == 0 ? 32 : bits % 32;
+    const int bitsInMSW = VL_BITBIT_E(bits) ? VL_BITBIT_E(bits) : VL_EDATASIZE;
+    cvtEDataToStr(wp, newvalp[--words] << (VL_EDATASIZE - bitsInMSW));
     wp += bitsInMSW;
-    // clang-format off
-    switch (bitsInMSW) {
-    case 32: wp[-32] = '0' | static_cast<char>((val >> 31)    ); //FALLTHRU
-    case 31: wp[-31] = '0' | static_cast<char>((val >> 30) & 1); //FALLTHRU
-    case 30: wp[-30] = '0' | static_cast<char>((val >> 29) & 1); //FALLTHRU
-    case 29: wp[-29] = '0' | static_cast<char>((val >> 28) & 1); //FALLTHRU
-    case 28: wp[-28] = '0' | static_cast<char>((val >> 27) & 1); //FALLTHRU
-    case 27: wp[-27] = '0' | static_cast<char>((val >> 26) & 1); //FALLTHRU
-    case 26: wp[-26] = '0' | static_cast<char>((val >> 25) & 1); //FALLTHRU
-    case 25: wp[-25] = '0' | static_cast<char>((val >> 24) & 1); //FALLTHRU
-    case 24: wp[-24] = '0' | static_cast<char>((val >> 23) & 1); //FALLTHRU
-    case 23: wp[-23] = '0' | static_cast<char>((val >> 22) & 1); //FALLTHRU
-    case 22: wp[-22] = '0' | static_cast<char>((val >> 21) & 1); //FALLTHRU
-    case 21: wp[-21] = '0' | static_cast<char>((val >> 20) & 1); //FALLTHRU
-    case 20: wp[-20] = '0' | static_cast<char>((val >> 19) & 1); //FALLTHRU
-    case 19: wp[-19] = '0' | static_cast<char>((val >> 18) & 1); //FALLTHRU
-    case 18: wp[-18] = '0' | static_cast<char>((val >> 17) & 1); //FALLTHRU
-    case 17: wp[-17] = '0' | static_cast<char>((val >> 16) & 1); //FALLTHRU
-    case 16: wp[-16] = '0' | static_cast<char>((val >> 15) & 1); //FALLTHRU
-    case 15: wp[-15] = '0' | static_cast<char>((val >> 14) & 1); //FALLTHRU
-    case 14: wp[-14] = '0' | static_cast<char>((val >> 13) & 1); //FALLTHRU
-    case 13: wp[-13] = '0' | static_cast<char>((val >> 12) & 1); //FALLTHRU
-    case 12: wp[-12] = '0' | static_cast<char>((val >> 11) & 1); //FALLTHRU
-    case 11: wp[-11] = '0' | static_cast<char>((val >> 10) & 1); //FALLTHRU
-    case 10: wp[-10] = '0' | static_cast<char>((val >>  9) & 1); //FALLTHRU
-    case 9:  wp[ -9] = '0' | static_cast<char>((val >>  8) & 1); //FALLTHRU
-    case 8:  wp[ -8] = '0' | static_cast<char>((val >>  7) & 1); //FALLTHRU
-    case 7:  wp[ -7] = '0' | static_cast<char>((val >>  6) & 1); //FALLTHRU
-    case 6:  wp[ -6] = '0' | static_cast<char>((val >>  5) & 1); //FALLTHRU
-    case 5:  wp[ -5] = '0' | static_cast<char>((val >>  4) & 1); //FALLTHRU
-    case 4:  wp[ -4] = '0' | static_cast<char>((val >>  3) & 1); //FALLTHRU
-    case 3:  wp[ -3] = '0' | static_cast<char>((val >>  2) & 1); //FALLTHRU
-    case 2:  wp[ -2] = '0' | static_cast<char>((val >>  1) & 1); //FALLTHRU
-    case 1:  wp[ -1] = '0' | static_cast<char>((val      ) & 1); //FALLTHRU
-    }
-    // clang-format on
     // Handle the remaining words
     while (words > 0) {
-        vluint32_t val = newvalp[--words];
-        int loops = 4;
-        do {
-            wp[0] = '0' | static_cast<char>((val >> 31));
-            wp[1] = '0' | static_cast<char>((val >> 30) & 1);
-            wp[2] = '0' | static_cast<char>((val >> 29) & 1);
-            wp[3] = '0' | static_cast<char>((val >> 28) & 1);
-            wp[4] = '0' | static_cast<char>((val >> 27) & 1);
-            wp[5] = '0' | static_cast<char>((val >> 26) & 1);
-            wp[6] = '0' | static_cast<char>((val >> 25) & 1);
-            wp[7] = '0' | static_cast<char>((val >> 24) & 1);
-            wp += 8;
-            val <<= 8;
-        } while (--loops);
+        cvtEDataToStr(wp, newvalp[--words]);
+        wp += VL_EDATASIZE;
     }
     finishLine(code, wp);
 }
 
 VL_ATTR_ALWINLINE
 void VerilatedVcd::emitFloat(vluint32_t code, float newval) {
-    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
     char* wp = m_writep;
     // Buffer can't overflow before sprintf; we sized during declaration
     sprintf(wp, "r%.16g", static_cast<double>(newval));
@@ -837,7 +710,6 @@ void VerilatedVcd::emitFloat(vluint32_t code, float newval) {
 
 VL_ATTR_ALWINLINE
 void VerilatedVcd::emitDouble(vluint32_t code, double newval) {
-    VL_PREFETCH_RD(VL_VCD_SUFFIXP(code));
     char* wp = m_writep;
     // Buffer can't overflow before sprintf; we sized during declaration
     sprintf(wp, "r%.16g", newval);
@@ -845,8 +717,6 @@ void VerilatedVcd::emitDouble(vluint32_t code, double newval) {
     finishLine(code, wp);
 }
 
-#undef VL_VCD_SUFFIXP
-
 #ifdef VL_TRACE_VCD_OLD_API
 
 void VerilatedVcd::fullBit(vluint32_t code, const vluint32_t newval) {
diff --git a/include/verilated_vcd_c.h b/include/verilated_vcd_c.h
index d9049366f..972c8c0e7 100644
--- a/include/verilated_vcd_c.h
+++ b/include/verilated_vcd_c.h
@@ -124,10 +124,12 @@ protected:
 
     // Implementations of duck-typed methods for VerilatedTrace. These are
     // called from only one place (namely full*) so always inline them.
-    inline void emitBit(vluint32_t code, vluint32_t newval);
-    inline void emitBus(vluint32_t code, vluint32_t newval, int bits);
-    inline void emitQuad(vluint32_t code, vluint64_t newval, int bits);
-    inline void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits);
+    inline void emitBit(vluint32_t code, CData newval);
+    inline void emitCData(vluint32_t code, CData newval, int bits);
+    inline void emitSData(vluint32_t code, SData newval, int bits);
+    inline void emitIData(vluint32_t code, IData newval, int bits);
+    inline void emitQData(vluint32_t code, QData newval, int bits);
+    inline void emitWData(vluint32_t code, const WData* newvalp, int bits);
     inline void emitFloat(vluint32_t code, float newval);
     inline void emitDouble(vluint32_t code, double newval);
 
@@ -176,37 +178,48 @@ public:
                      int lsb);
     void declTriArray(vluint32_t code, const char* name, bool array, int arraynum, int msb,
                       int lsb);
-    //=========================================================================
-    // Write back to previous value buffer value and emit
 
-    void fullBit(vluint32_t* oldp, vluint32_t newval) { fullBit(oldp - this->oldp(0), newval); }
-    void fullBus(vluint32_t* oldp, vluint32_t newval, int bits) {
+    void fullBit(vluint32_t* oldp, CData newval) { fullBit(oldp - this->oldp(0), newval); }
+    void fullCData(vluint32_t* oldp, CData newval, int bits) {
         fullBus(oldp - this->oldp(0), newval, bits);
     }
-    void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
+    void fullSData(vluint32_t* oldp, SData newval, int bits) {
+        fullBus(oldp - this->oldp(0), newval, bits);
+    }
+    void fullIData(vluint32_t* oldp, IData newval, int bits) {
+        fullBus(oldp - this->oldp(0), newval, bits);
+    }
+    void fullQData(vluint32_t* oldp, QData newval, int bits) {
         fullQuad(oldp - this->oldp(0), newval, bits);
     }
-    void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
+    void fullWData(vluint32_t* oldp, const WData* newvalp, int bits) {
         fullArray(oldp - this->oldp(0), newvalp, bits);
     }
     void fullFloat(vluint32_t* oldp, float newval) { fullFloat(oldp - this->oldp(0), newval); }
     void fullDouble(vluint32_t* oldp, double newval) { fullDouble(oldp - this->oldp(0), newval); }
 
-    //=========================================================================
-    // Check previous value and emit if changed
-
-    void chgBit(vluint32_t* oldp, vluint32_t newval) { chgBit(oldp - this->oldp(0), newval); }
-    void chgBus(vluint32_t* oldp, vluint32_t newval, int bits) {
+    inline void chgBit(vluint32_t* oldp, CData newval) { chgBit(oldp - this->oldp(0), newval); }
+    inline void chgCData(vluint32_t* oldp, CData newval, int bits) {
         chgBus(oldp - this->oldp(0), newval, bits);
     }
-    void chgQuad(vluint32_t* oldp, vluint64_t newval, int bits) {
+    inline void chgSData(vluint32_t* oldp, SData newval, int bits) {
+        chgBus(oldp - this->oldp(0), newval, bits);
+    }
+    inline void chgIData(vluint32_t* oldp, IData newval, int bits) {
+        chgBus(oldp - this->oldp(0), newval, bits);
+    }
+    inline void chgQData(vluint32_t* oldp, QData newval, int bits) {
         chgQuad(oldp - this->oldp(0), newval, bits);
     }
-    void chgArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) {
+    inline void chgWData(vluint32_t* oldp, const WData* newvalp, int bits) {
         chgArray(oldp - this->oldp(0), newvalp, bits);
     }
-    void chgFloat(vluint32_t* oldp, float newval) { chgFloat(oldp - this->oldp(0), newval); }
-    void chgDouble(vluint32_t* oldp, double newval) { chgDouble(oldp - this->oldp(0), newval); }
+    inline void chgFloat(vluint32_t* oldp, float newval) {
+        chgFloat(oldp - this->oldp(0), newval);
+    }
+    inline void chgDouble(vluint32_t* oldp, double newval) {
+        chgDouble(oldp - this->oldp(0), newval);
+    }
 
     /// Inside dumping routines, dump one signal, faster when not inlined
     /// due to code size reduction.
diff --git a/include/verilatedos.h b/include/verilatedos.h
index 8f689190b..352f7f87b 100644
--- a/include/verilatedos.h
+++ b/include/verilatedos.h
@@ -475,6 +475,16 @@ typedef unsigned long long vluint64_t;  ///< 64-bit unsigned type
 #else
 # define VL_STRCASECMP strcasecmp
 #endif
+
+//=========================================================================
+// Macros controlling target specific optimizations
+
+// Define VL_PORTABLE_ONLY to disable all target specific optimizations
+#ifndef VL_PORTABLE_ONLY
+# ifdef __x86_64__
+#  define VL_X86_64 1
+# endif
+#endif // VL_PORTABLE_ONLY
 // clang-format on
 
 //=========================================================================
diff --git a/src/V3EmitC.cpp b/src/V3EmitC.cpp
index 061e0b1b3..d31156ccf 100644
--- a/src/V3EmitC.cpp
+++ b/src/V3EmitC.cpp
@@ -3550,20 +3550,23 @@ class EmitCTrace : EmitCStmts {
         const bool full = (m_funcp->funcType() == AstCFuncType::TRACE_FULL
                            || m_funcp->funcType() == AstCFuncType::TRACE_FULL_SUB);
         const string func = full ? "full" : "chg";
-        bool emitWidth = false;
+        bool emitWidth = true;
         if (nodep->dtypep()->basicp()->isDouble()) {
             puts("vcdp->" + func + "Double");
+            emitWidth = false;
         } else if (nodep->isWide() || emitTraceIsScBv(nodep) || emitTraceIsScBigUint(nodep)) {
-            puts("vcdp->" + func + "Array");
-            emitWidth = true;
+            puts("vcdp->" + func + "WData");
         } else if (nodep->isQuad()) {
-            puts("vcdp->" + func + "Quad");
-            emitWidth = true;
+            puts("vcdp->" + func + "QData");
+        } else if (nodep->declp()->widthMin() > 16) {
+            puts("vcdp->" + func + "IData");
+        } else if (nodep->declp()->widthMin() > 8) {
+            puts("vcdp->" + func + "SData");
         } else if (nodep->declp()->widthMin() > 1) {
-            puts("vcdp->" + func + "Bus");
-            emitWidth = true;
+            puts("vcdp->" + func + "CData");
         } else {
             puts("vcdp->" + func + "Bit");
+            emitWidth = false;
         }
 
         const uint32_t offset = (arrayindex < 0) ? 0 : (arrayindex * nodep->declp()->widthWords());
diff --git a/test_regress/t/t_trace_array_fst_portable.pl b/test_regress/t/t_trace_array_fst_portable.pl
new file mode 100755
index 000000000..0b15dea3c
--- /dev/null
+++ b/test_regress/t/t_trace_array_fst_portable.pl
@@ -0,0 +1,28 @@
+#!/usr/bin/perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003-2009 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+scenarios(vlt => 1);
+
+top_filename("t/t_trace_array.v");
+$Self->{golden_filename} = "t/t_trace_array_fst.out";
+
+compile(
+    verilator_flags2 => ['--cc --trace-fst --trace-structs',
+                         '-CFLAGS -DVL_PORTABLE_ONLY'],
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+fst_identical($Self->trace_filename, $Self->{golden_filename});
+
+ok(1);
+1;
diff --git a/test_regress/t/t_trace_complex_portable.pl b/test_regress/t/t_trace_complex_portable.pl
new file mode 100755
index 000000000..b168579a4
--- /dev/null
+++ b/test_regress/t/t_trace_complex_portable.pl
@@ -0,0 +1,38 @@
+#!/usr/bin/perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003-2009 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+# Same test as t_trace_complex, but exercising the old VCD tracing API
+
+scenarios(vlt => 1);
+
+top_filename("t/t_trace_complex.v");
+
+compile(
+    verilator_flags2 => ['--cc --trace -CFLAGS -DVL_PORTABLE_ONLY'],
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_strp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_strp_strp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_arrp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_strp /);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arru\(/);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arru\(/);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arrp\(/);
+file_grep     ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_strp\(/);
+
+vcd_identical ("$Self->{obj_dir}/simx.vcd", "t/t_trace_complex.out");
+
+ok(1);
+1;