diff --git a/include/verilated_fst_c.cpp b/include/verilated_fst_c.cpp index c4b1dc077..c54d818d8 100644 --- a/include/verilated_fst_c.cpp +++ b/include/verilated_fst_c.cpp @@ -64,11 +64,13 @@ VerilatedFst::VerilatedFst(void* fst) : m_fst(fst) - , m_symbolp(NULL) {} + , m_symbolp(NULL) + , m_strbuf(NULL) {} VerilatedFst::~VerilatedFst() { if (m_fst) fstWriterClose(m_fst); if (m_symbolp) VL_DO_CLEAR(delete[] m_symbolp, m_symbolp = NULL); + if (m_strbuf) VL_DO_CLEAR(delete[] m_strbuf, m_strbuf = NULL); } void VerilatedFst::open(const char* filename) VL_MT_UNSAFE { @@ -100,6 +102,9 @@ void VerilatedFst::open(const char* filename) VL_MT_UNSAFE { } } m_code2symbol.clear(); + + // Allocate string buffer for arrays + if (!m_strbuf) { m_strbuf = new char[maxBits() + 32]; } } void VerilatedFst::close() { @@ -213,25 +218,59 @@ void VerilatedFst::declDouble(vluint32_t code, const char* name, int dtypenum, f // so always inline them. VL_ATTR_ALWINLINE -void VerilatedFst::emitBit(vluint32_t code, vluint32_t newval) { +void VerilatedFst::emitBit(vluint32_t code, CData newval) { fstWriterEmitValueChange(m_fst, m_symbolp[code], newval ? "1" : "0"); } + VL_ATTR_ALWINLINE -void VerilatedFst::emitBus(vluint32_t code, vluint32_t newval, int bits) { - fstWriterEmitValueChange32(m_fst, m_symbolp[code], bits, newval); +void VerilatedFst::emitCData(vluint32_t code, CData newval, int bits) { + char buf[VL_BYTESIZE]; + cvtCDataToStr(buf, newval << (VL_BYTESIZE - bits)); + fstWriterEmitValueChange(m_fst, m_symbolp[code], buf); } + VL_ATTR_ALWINLINE -void VerilatedFst::emitQuad(vluint32_t code, vluint64_t newval, int bits) { - fstWriterEmitValueChange64(m_fst, m_symbolp[code], bits, newval); +void VerilatedFst::emitSData(vluint32_t code, SData newval, int bits) { + char buf[VL_SHORTSIZE]; + cvtSDataToStr(buf, newval << (VL_SHORTSIZE - bits)); + fstWriterEmitValueChange(m_fst, m_symbolp[code], buf); } + VL_ATTR_ALWINLINE -void VerilatedFst::emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) { - fstWriterEmitValueChangeVec32(m_fst, m_symbolp[code], bits, newvalp); +void VerilatedFst::emitIData(vluint32_t code, IData newval, int bits) { + char buf[VL_IDATASIZE]; + cvtIDataToStr(buf, newval << (VL_IDATASIZE - bits)); + fstWriterEmitValueChange(m_fst, m_symbolp[code], buf); } + +VL_ATTR_ALWINLINE +void VerilatedFst::emitQData(vluint32_t code, QData newval, int bits) { + char buf[VL_QUADSIZE]; + cvtQDataToStr(buf, newval << (VL_QUADSIZE - bits)); + fstWriterEmitValueChange(m_fst, m_symbolp[code], buf); +} + +VL_ATTR_ALWINLINE +void VerilatedFst::emitWData(vluint32_t code, const WData* newvalp, int bits) { + int words = VL_WORDS_I(bits); + char* wp = m_strbuf; + // Convert the most significant word + const int bitsInMSW = VL_BITBIT_E(bits) ? VL_BITBIT_E(bits) : VL_EDATASIZE; + cvtEDataToStr(wp, newvalp[--words] << (VL_EDATASIZE - bitsInMSW)); + wp += bitsInMSW; + // Convert the remaining words + while (words > 0) { + cvtEDataToStr(wp, newvalp[--words]); + wp += VL_EDATASIZE; + } + fstWriterEmitValueChange(m_fst, m_symbolp[code], m_strbuf); +} + VL_ATTR_ALWINLINE void VerilatedFst::emitFloat(vluint32_t code, float newval) { fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval); } + VL_ATTR_ALWINLINE void VerilatedFst::emitDouble(vluint32_t code, double newval) { fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval); diff --git a/include/verilated_fst_c.h b/include/verilated_fst_c.h index b80d8ea7a..c2b0605c7 100644 --- a/include/verilated_fst_c.h +++ b/include/verilated_fst_c.h @@ -51,6 +51,8 @@ private: Local2FstDtype m_local2fstdtype; std::list m_curScope; fstHandle* m_symbolp; ///< same as m_code2symbol, but as an array + char* m_strbuf; ///< String buffer long enough to hold maxBits() chars + // CONSTRUCTORS VL_UNCOPYABLE(VerilatedFst); void declSymbol(vluint32_t code, const char* name, int dtypenum, fstVarDir vardir, @@ -69,10 +71,12 @@ protected: // Implementations of duck-typed methods for VerilatedTrace. These are // called from only one place (namely full*) so always inline them. - inline void emitBit(vluint32_t code, vluint32_t newval); - inline void emitBus(vluint32_t code, vluint32_t newval, int bits); - inline void emitQuad(vluint32_t code, vluint64_t newval, int bits); - inline void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits); + inline void emitBit(vluint32_t code, CData newval); + inline void emitCData(vluint32_t code, CData newval, int bits); + inline void emitSData(vluint32_t code, SData newval, int bits); + inline void emitIData(vluint32_t code, IData newval, int bits); + inline void emitQData(vluint32_t code, QData newval, int bits); + inline void emitWData(vluint32_t code, const WData* newvalp, int bits); inline void emitFloat(vluint32_t code, float newval); inline void emitDouble(vluint32_t code, double newval); diff --git a/include/verilated_intrinsics.h b/include/verilated_intrinsics.h new file mode 100644 index 000000000..11b532fd7 --- /dev/null +++ b/include/verilated_intrinsics.h @@ -0,0 +1,41 @@ +// -*- mode: C++; c-file-style: "cc-mode" -*- +//************************************************************************* +// +// Copyright 2003-2020 by Wilson Snyder. This program is free software; you can +// redistribute it and/or modify it under the terms of either the GNU +// Lesser General Public License Version 3 or the Perl Artistic License +// Version 2.0. +// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 +// +//************************************************************************* +/// +/// \file +/// \brief Verilator: Common include for target specific intrinsics. +/// +/// Code using machine specific intrinsics for optimization should +/// include this header rather than directly including he target +/// specific headers. We provide macros to check for availability +/// of instruction sets, and a common mechanism to disable them. +/// +//************************************************************************* + +#ifndef _VERILATED_INTRINSICS_H_ +#define _VERILATED_INTRINSICS_H_ 1 ///< Header Guard + +// clang-format off + +// Use VL_DISABLE_INTRINSICS to disable all intrinsics based optimization +#if !defined(VL_DISABLE_INTRINSICS) && !defined(VL_PORTABLE_ONLY) +# if defined(__SSE2__) && !defined(VL_DISABLE_SSE2) +# define VL_HAVE_SSE2 1 +# include +# endif +# if defined(__AVX2__) && defined(VL_HAVE_SSE2) && !defined(VL_DISABLE_AVX2) +# define VL_HAVE_AVX2 1 +# include +# endif +#endif + +// clang-format on + +#endif // Guard diff --git a/include/verilated_trace.h b/include/verilated_trace.h index bb0bce25a..471db6b99 100644 --- a/include/verilated_trace.h +++ b/include/verilated_trace.h @@ -90,11 +90,13 @@ public: enum { CHG_BIT_0 = 0x0, CHG_BIT_1 = 0x1, - CHG_BUS = 0x2, - CHG_QUAD = 0x3, - CHG_ARRAY = 0x4, - CHG_FLOAT = 0x5, - CHG_DOUBLE = 0x6, + CHG_CDATA = 0x2, + CHG_SDATA = 0x3, + CHG_IDATA = 0x4, + CHG_QDATA = 0x5, + CHG_WDATA = 0x6, + CHG_FLOAT = 0x7, + CHG_DOUBLE = 0x8, // TODO: full.. TIME_CHANGE = 0xd, END = 0xe, // End of buffer @@ -122,6 +124,7 @@ private: bool m_fullDump; ///< Whether a full dump is required on the next call to 'dump' vluint32_t m_nextCode; ///< Next code number to assign vluint32_t m_numSignals; ///< Number of distinct signals + vluint32_t m_maxBits; ///< Number of bits in the widest signal std::string m_moduleName; ///< Name of module being trace initialized now char m_scopeEscape; double m_timeRes; ///< Time resolution (ns/ms etc) @@ -176,6 +179,7 @@ protected: vluint32_t nextCode() const { return m_nextCode; } vluint32_t numSignals() const { return m_numSignals; } + vluint32_t maxBits() const { return m_maxBits; } const std::string& moduleName() const { return m_moduleName; } void fullDump(bool value) { m_fullDump = value; } vluint64_t timeLastDump() { return m_timeLastDump; } @@ -251,47 +255,65 @@ public: // these here, but we cannot afford dynamic dispatch for calling these as // this is very hot code during tracing. - // duck-typed void emitBit(vluint32_t code, vluint32_t newval) = 0; - // duck-typed void emitBus(vluint32_t code, vluint32_t newval, int bits) = 0; - // duck-typed void emitQuad(vluint32_t code, vluint64_t newval, int bits) = 0; - // duck-typed void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) = 0; + // duck-typed void emitBit(vluint32_t code, CData newval) = 0; + // duck-typed void emitCData(vluint32_t code, CData newval, int bits) = 0; + // duck-typed void emitSData(vluint32_t code, SData newval, int bits) = 0; + // duck-typed void emitIData(vluint32_t code, IData newval, int bits) = 0; + // duck-typed void emitQData(vluint32_t code, QData newval, int bits) = 0; + // duck-typed void emitWData(vluint32_t code, const WData* newvalp, int bits) = 0; // duck-typed void emitFloat(vluint32_t code, float newval) = 0; // duck-typed void emitDouble(vluint32_t code, double newval) = 0; vluint32_t* oldp(vluint32_t code) { return m_sigs_oldvalp + code; } // Write to previous value buffer value and emit trace entry. - void fullBit(vluint32_t* oldp, vluint32_t newval); - void fullBus(vluint32_t* oldp, vluint32_t newval, int bits); - void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits); - void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits); + void fullBit(vluint32_t* oldp, CData newval); + void fullCData(vluint32_t* oldp, CData newval, int bits); + void fullSData(vluint32_t* oldp, SData newval, int bits); + void fullIData(vluint32_t* oldp, IData newval, int bits); + void fullQData(vluint32_t* oldp, QData newval, int bits); + void fullWData(vluint32_t* oldp, const WData* newvalp, int bits); void fullFloat(vluint32_t* oldp, float newval); void fullDouble(vluint32_t* oldp, double newval); #ifdef VL_TRACE_THREADED // Threaded tracing. Just dump everything in the trace buffer - inline void chgBit(vluint32_t code, vluint32_t newval) { + inline void chgBit(vluint32_t code, CData newval) { m_traceBufferWritep[0] = VerilatedTraceCommand::CHG_BIT_0 | newval; m_traceBufferWritep[1] = code; m_traceBufferWritep += 2; VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); } - inline void chgBus(vluint32_t code, vluint32_t newval, int bits) { - m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_BUS; + inline void chgCData(vluint32_t code, CData newval, int bits) { + m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_CDATA; m_traceBufferWritep[1] = code; m_traceBufferWritep[2] = newval; m_traceBufferWritep += 3; VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); } - inline void chgQuad(vluint32_t code, vluint64_t newval, int bits) { - m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_QUAD; + inline void chgSData(vluint32_t code, SData newval, int bits) { + m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_SDATA; m_traceBufferWritep[1] = code; - *reinterpret_cast(m_traceBufferWritep + 2) = newval; + m_traceBufferWritep[2] = newval; + m_traceBufferWritep += 3; + VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); + } + inline void chgIData(vluint32_t code, IData newval, int bits) { + m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_IDATA; + m_traceBufferWritep[1] = code; + m_traceBufferWritep[2] = newval; + m_traceBufferWritep += 3; + VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); + } + inline void chgQData(vluint32_t code, QData newval, int bits) { + m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_QDATA; + m_traceBufferWritep[1] = code; + *reinterpret_cast(m_traceBufferWritep + 2) = newval; m_traceBufferWritep += 4; VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); } - inline void chgArray(vluint32_t code, const vluint32_t* newvalp, int bits) { - m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_ARRAY; + inline void chgWData(vluint32_t code, const WData* newvalp, int bits) { + m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_WDATA; m_traceBufferWritep[1] = code; m_traceBufferWritep += 2; for (int i = 0; i < (bits + 31) / 32; ++i) { *m_traceBufferWritep++ = newvalp[i]; } @@ -324,22 +346,30 @@ public: // thread and are called chg*Impl // Check previous dumped value of signal. If changed, then emit trace entry - inline void CHG(Bit)(vluint32_t* oldp, vluint32_t newval) { + inline void CHG(Bit)(vluint32_t* oldp, CData newval) { const vluint32_t diff = *oldp ^ newval; if (VL_UNLIKELY(diff)) fullBit(oldp, newval); } - inline void CHG(Bus)(vluint32_t* oldp, vluint32_t newval, int bits) { + inline void CHG(CData)(vluint32_t* oldp, CData newval, int bits) { const vluint32_t diff = *oldp ^ newval; - if (VL_UNLIKELY(diff)) fullBus(oldp, newval, bits); + if (VL_UNLIKELY(diff)) fullCData(oldp, newval, bits); } - inline void CHG(Quad)(vluint32_t* oldp, vluint64_t newval, int bits) { - const vluint64_t diff = *reinterpret_cast(oldp) ^ newval; - if (VL_UNLIKELY(diff)) fullQuad(oldp, newval, bits); + inline void CHG(SData)(vluint32_t* oldp, SData newval, int bits) { + const vluint32_t diff = *oldp ^ newval; + if (VL_UNLIKELY(diff)) fullSData(oldp, newval, bits); } - inline void CHG(Array)(vluint32_t* oldp, const vluint32_t* newvalp, int bits) { + inline void CHG(IData)(vluint32_t* oldp, IData newval, int bits) { + const vluint32_t diff = *oldp ^ newval; + if (VL_UNLIKELY(diff)) fullIData(oldp, newval, bits); + } + inline void CHG(QData)(vluint32_t* oldp, QData newval, int bits) { + const vluint64_t diff = *reinterpret_cast(oldp) ^ newval; + if (VL_UNLIKELY(diff)) fullQData(oldp, newval, bits); + } + inline void CHG(WData)(vluint32_t* oldp, const WData* newvalp, int bits) { for (int i = 0; i < (bits + 31) / 32; ++i) { if (VL_UNLIKELY(oldp[i] ^ newvalp[i])) { - fullArray(oldp, newvalp, bits); + fullWData(oldp, newvalp, bits); return; } } diff --git a/include/verilated_trace_imp.cpp b/include/verilated_trace_imp.cpp index 2e5345d65..78dc19a11 100644 --- a/include/verilated_trace_imp.cpp +++ b/include/verilated_trace_imp.cpp @@ -23,6 +23,7 @@ # error "This file should be included in trace format implementations" #endif +#include "verilated_intrinsics.h" #include "verilated_trace.h" #if 0 @@ -166,22 +167,34 @@ template <> void VerilatedTrace::workerThreadMain() { VL_TRACE_THREAD_DEBUG("Command CHG_BIT_1 " << top); chgBitImpl(oldp, 1); continue; - case VerilatedTraceCommand::CHG_BUS: - VL_TRACE_THREAD_DEBUG("Command CHG_BUS " << top); + case VerilatedTraceCommand::CHG_CDATA: + VL_TRACE_THREAD_DEBUG("Command CHG_CDATA " << top); // Bits stored in bottom byte of command - chgBusImpl(oldp, *readp, top); + chgCDataImpl(oldp, *readp, top); readp += 1; continue; - case VerilatedTraceCommand::CHG_QUAD: - VL_TRACE_THREAD_DEBUG("Command CHG_QUAD " << top); + case VerilatedTraceCommand::CHG_SDATA: + VL_TRACE_THREAD_DEBUG("Command CHG_SDATA " << top); // Bits stored in bottom byte of command - chgQuadImpl(oldp, *reinterpret_cast(readp), top); + chgSDataImpl(oldp, *readp, top); + readp += 1; + continue; + case VerilatedTraceCommand::CHG_IDATA: + VL_TRACE_THREAD_DEBUG("Command CHG_IDATA " << top); + // Bits stored in bottom byte of command + chgIDataImpl(oldp, *readp, top); + readp += 1; + continue; + case VerilatedTraceCommand::CHG_QDATA: + VL_TRACE_THREAD_DEBUG("Command CHG_QDATA " << top); + // Bits stored in bottom byte of command + chgQDataImpl(oldp, *reinterpret_cast(readp), top); readp += 2; continue; - case VerilatedTraceCommand::CHG_ARRAY: - VL_TRACE_THREAD_DEBUG("Command CHG_ARRAY " << top); - chgArrayImpl(oldp, readp, top); - readp += (top + 31) / 32; + case VerilatedTraceCommand::CHG_WDATA: + VL_TRACE_THREAD_DEBUG("Command CHG_WDATA " << top); + chgWDataImpl(oldp, readp, top); + readp += VL_WORDS_I(top); continue; case VerilatedTraceCommand::CHG_FLOAT: VL_TRACE_THREAD_DEBUG("Command CHG_FLOAT " << top); @@ -284,6 +297,7 @@ VerilatedTrace::VerilatedTrace() , m_fullDump(true) , m_nextCode(0) , m_numSignals(0) + , m_maxBits(0) , m_scopeEscape('.') , m_timeRes(1e-9) , m_timeUnit(1e-9) @@ -318,6 +332,7 @@ template <> void VerilatedTrace::traceInit() VL_MT_UNSAFE { const vluint32_t expectedCodes = nextCode(); m_nextCode = 1; m_numSignals = 0; + m_maxBits = 0; // Call all initialize callbacks, which will call decl* for each signal. for (vluint32_t ent = 0; ent < m_callbacks.size(); ++ent) { @@ -355,10 +370,11 @@ void VerilatedTrace::declCode(vluint32_t code, vluint32_t bits, bo } // Note: The tri-state flag is not used by Verilator, but is here for // compatibility with some foreign code. - int codesNeeded = (bits + 31) / 32; + int codesNeeded = VL_WORDS_I(bits); if (tri) codesNeeded *= 2; m_nextCode = std::max(m_nextCode, code + codesNeeded); ++m_numSignals; + m_maxBits = std::max(m_maxBits, bits); } //========================================================================= @@ -486,35 +502,139 @@ void VerilatedTrace::addCallback(callback_t initcb, callback_t ful // that this file must be included in the format specific implementation, so // the emit* functions can be inlined for performance. -template <> void VerilatedTrace::fullBit(vluint32_t* oldp, vluint32_t newval) { +template <> void VerilatedTrace::fullBit(vluint32_t* oldp, CData newval) { *oldp = newval; self()->emitBit(oldp - m_sigs_oldvalp, newval); } template <> -void VerilatedTrace::fullBus(vluint32_t* oldp, vluint32_t newval, int bits) { +void VerilatedTrace::fullCData(vluint32_t* oldp, CData newval, int bits) { *oldp = newval; - self()->emitBus(oldp - m_sigs_oldvalp, newval, bits); + self()->emitCData(oldp - m_sigs_oldvalp, newval, bits); } template <> -void VerilatedTrace::fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) { - *reinterpret_cast(oldp) = newval; - self()->emitQuad(oldp - m_sigs_oldvalp, newval, bits); +void VerilatedTrace::fullSData(vluint32_t* oldp, SData newval, int bits) { + *oldp = newval; + self()->emitSData(oldp - m_sigs_oldvalp, newval, bits); } + template <> -void VerilatedTrace::fullArray(vluint32_t* oldp, const vluint32_t* newvalp, - int bits) { - for (int i = 0; i < (bits + 31) / 32; ++i) oldp[i] = newvalp[i]; - self()->emitArray(oldp - m_sigs_oldvalp, newvalp, bits); +void VerilatedTrace::fullIData(vluint32_t* oldp, IData newval, int bits) { + *oldp = newval; + self()->emitIData(oldp - m_sigs_oldvalp, newval, bits); } + +template <> +void VerilatedTrace::fullQData(vluint32_t* oldp, QData newval, int bits) { + *reinterpret_cast(oldp) = newval; + self()->emitQData(oldp - m_sigs_oldvalp, newval, bits); +} + +template <> +void VerilatedTrace::fullWData(vluint32_t* oldp, const WData* newvalp, int bits) { + for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i]; + self()->emitWData(oldp - m_sigs_oldvalp, newvalp, bits); +} + template <> void VerilatedTrace::fullFloat(vluint32_t* oldp, float newval) { // cppcheck-suppress invalidPointerCast *reinterpret_cast(oldp) = newval; self()->emitFloat(oldp - m_sigs_oldvalp, newval); } + template <> void VerilatedTrace::fullDouble(vluint32_t* oldp, double newval) { // cppcheck-suppress invalidPointerCast *reinterpret_cast(oldp) = newval; self()->emitDouble(oldp - m_sigs_oldvalp, newval); } + +//========================================================================= +// Primitives converting binary values to strings... + +// All of these take a destination pointer where the string will be emitted, +// and a value to convert. There are a couple of variants for efficiency. + +inline static void cvtCDataToStr(char* dstp, CData value) { +#ifdef VL_HAVE_SSE2 + // Similar to cvtSDataToStr but only the bottom 8 byte lanes are used + const __m128i a = _mm_cvtsi32_si128(value); + const __m128i b = _mm_unpacklo_epi8(a, a); + const __m128i c = _mm_shufflelo_epi16(b, 0); + const __m128i m = _mm_set1_epi64x(0x0102040810204080); + const __m128i d = _mm_cmpeq_epi8(_mm_and_si128(c, m), m); + const __m128i result = _mm_sub_epi8(_mm_set1_epi8('0'), d); + _mm_storel_epi64(reinterpret_cast<__m128i*>(dstp), result); +#else + dstp[0] = '0' | static_cast((value >> 7) & 1); + dstp[1] = '0' | static_cast((value >> 6) & 1); + dstp[2] = '0' | static_cast((value >> 5) & 1); + dstp[3] = '0' | static_cast((value >> 4) & 1); + dstp[4] = '0' | static_cast((value >> 3) & 1); + dstp[5] = '0' | static_cast((value >> 2) & 1); + dstp[6] = '0' | static_cast((value >> 1) & 1); + dstp[7] = '0' | static_cast(value & 1); +#endif +} + +inline static void cvtSDataToStr(char* dstp, SData value) { +#ifdef VL_HAVE_SSE2 + // We want each bit in the 16-bit input value to end up in a byte lane + // within the 128-bit XMM register. Note that x86 is little-endian and we + // want the MSB of the input at the low address, so we will bit-reverse + // at the same time. + + // Put value in bottom of 128-bit register a[15:0] = value + const __m128i a = _mm_cvtsi32_si128(value); + // Interleave bytes with themselves + // b[15: 0] = {2{a[ 7:0]}} == {2{value[ 7:0]}} + // b[31:16] = {2{a[15:8]}} == {2{value[15:8]}} + const __m128i b = _mm_unpacklo_epi8(a, a); + // Shuffle bottom 64 bits, note swapping high bytes with low bytes + // c[31: 0] = {2{b[31:16]}} == {4{value[15:8}} + // c[63:32] = {2{b[15: 0]}} == {4{value[ 7:0}} + const __m128i c = _mm_shufflelo_epi16(b, 0x05); + // Shuffle whole register + // d[ 63: 0] = {2{c[31: 0]}} == {8{value[15:8}} + // d[126:54] = {2{c[63:32]}} == {8{value[ 7:0}} + const __m128i d = _mm_shuffle_epi32(c, 0x50); + // Test each bit within the bytes, this sets each byte lane to 0 + // if the bit for that lane is 0 and to 0xff if the bit is 1. + const __m128i m = _mm_set1_epi64x(0x0102040810204080); + const __m128i e = _mm_cmpeq_epi8(_mm_and_si128(d, m), m); + // Convert to ASCII by subtracting the masks from ASCII '0': + // '0' - 0 is '0', '0' - -1 is '1' + const __m128i result = _mm_sub_epi8(_mm_set1_epi8('0'), e); + // Store the 16 characters to the un-aligned buffer + _mm_storeu_si128(reinterpret_cast<__m128i*>(dstp), result); +#else + cvtCDataToStr(dstp, value >> 8); + cvtCDataToStr(dstp + 8, value); +#endif +} + +inline static void cvtIDataToStr(char* dstp, IData value) { +#ifdef VL_HAVE_AVX2 + // Similar to cvtSDataToStr but the bottom 16-bits are processed in the + // top half of the YMM registerss + const __m256i a = _mm256_insert_epi32(_mm256_undefined_si256(), value, 0); + const __m256i b = _mm256_permute4x64_epi64(a, 0); + const __m256i s = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, + 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3); + const __m256i c = _mm256_shuffle_epi8(b, s); + const __m256i m = _mm256_set1_epi64x(0x0102040810204080); + const __m256i d = _mm256_cmpeq_epi8(_mm256_and_si256(c, m), m); + const __m256i result = _mm256_sub_epi8(_mm256_set1_epi8('0'), d); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(dstp), result); +#else + cvtSDataToStr(dstp, value >> 16); + cvtSDataToStr(dstp + 16, value); +#endif +} + +inline static void cvtQDataToStr(char* dstp, QData value) { + cvtIDataToStr(dstp, value >> 32); + cvtIDataToStr(dstp + 32, value); +} + +#define cvtEDataToStr cvtIDataToStr diff --git a/include/verilated_vcd_c.cpp b/include/verilated_vcd_c.cpp index e1ffd2293..e940ee789 100644 --- a/include/verilated_vcd_c.cpp +++ b/include/verilated_vcd_c.cpp @@ -35,6 +35,8 @@ # include #endif +#include "verilated_intrinsics.h" + // SPDIFF_ON #ifndef O_LARGEFILE // For example on WIN32 @@ -606,20 +608,14 @@ void VerilatedVcd::declTriArray(vluint32_t code, const char* name, bool array, i #endif // VL_TRACE_VCD_OLD_API //============================================================================= -// Trace recording routines +// Trace rendering prinitives -//============================================================================= -// Emit trace entries - -#define VL_VCD_SUFFIXP(code) (m_suffixesp + (code)*VL_TRACE_SUFFIX_ENTRY_SIZE) - -// Emit suffix, write back write pointer, check buffer void VerilatedVcd::finishLine(vluint32_t code, char* writep) { - const char* const suffixp = VL_VCD_SUFFIXP(code); + const char* const suffixp = m_suffixesp + code * VL_TRACE_SUFFIX_ENTRY_SIZE; // Copy the whole suffix (this avoid having hard to predict branches which // helps a lot). Note: The maximum length of the suffix is // VL_TRACE_MAX_VCD_CODE_SIZE + 2 == 7, but we unroll this here for speed. -#ifdef __x86_64__ +#ifdef VL_X86_64 // Copy the whole 8 bytes in one go, this works on little-endian machines // supporting unaligned stores. *reinterpret_cast(writep) = *reinterpret_cast(suffixp); @@ -639,12 +635,15 @@ void VerilatedVcd::finishLine(vluint32_t code, char* writep) { bufferCheck(); } +//============================================================================= +// emit* trace routines + // Note: emit* are only ever called from one place (full* in // verilated_trace_imp.cpp, which is included in this file at the top), // so always inline them. VL_ATTR_ALWINLINE -void VerilatedVcd::emitBit(vluint32_t code, vluint32_t newval) { +void VerilatedVcd::emitBit(vluint32_t code, CData newval) { // Don't prefetch suffix as it's a bit too late; char* wp = m_writep; *wp++ = '0' | static_cast(newval); @@ -652,182 +651,56 @@ void VerilatedVcd::emitBit(vluint32_t code, vluint32_t newval) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitBus(vluint32_t code, vluint32_t newval, int bits) { - VL_PREFETCH_RD(VL_VCD_SUFFIXP(code)); +void VerilatedVcd::emitCData(vluint32_t code, CData newval, int bits) { char* wp = m_writep; *wp++ = 'b'; - wp += bits; - // clang-format off - switch (bits) { - case 32: wp[-32] = '0' | static_cast((newval >> 31) ); //FALLTHRU - case 31: wp[-31] = '0' | static_cast((newval >> 30) & 1); //FALLTHRU - case 30: wp[-30] = '0' | static_cast((newval >> 29) & 1); //FALLTHRU - case 29: wp[-29] = '0' | static_cast((newval >> 28) & 1); //FALLTHRU - case 28: wp[-28] = '0' | static_cast((newval >> 27) & 1); //FALLTHRU - case 27: wp[-27] = '0' | static_cast((newval >> 26) & 1); //FALLTHRU - case 26: wp[-26] = '0' | static_cast((newval >> 25) & 1); //FALLTHRU - case 25: wp[-25] = '0' | static_cast((newval >> 24) & 1); //FALLTHRU - case 24: wp[-24] = '0' | static_cast((newval >> 23) & 1); //FALLTHRU - case 23: wp[-23] = '0' | static_cast((newval >> 22) & 1); //FALLTHRU - case 22: wp[-22] = '0' | static_cast((newval >> 21) & 1); //FALLTHRU - case 21: wp[-21] = '0' | static_cast((newval >> 20) & 1); //FALLTHRU - case 20: wp[-20] = '0' | static_cast((newval >> 19) & 1); //FALLTHRU - case 19: wp[-19] = '0' | static_cast((newval >> 18) & 1); //FALLTHRU - case 18: wp[-18] = '0' | static_cast((newval >> 17) & 1); //FALLTHRU - case 17: wp[-17] = '0' | static_cast((newval >> 16) & 1); //FALLTHRU - case 16: wp[-16] = '0' | static_cast((newval >> 15) & 1); //FALLTHRU - case 15: wp[-15] = '0' | static_cast((newval >> 14) & 1); //FALLTHRU - case 14: wp[-14] = '0' | static_cast((newval >> 13) & 1); //FALLTHRU - case 13: wp[-13] = '0' | static_cast((newval >> 12) & 1); //FALLTHRU - case 12: wp[-12] = '0' | static_cast((newval >> 11) & 1); //FALLTHRU - case 11: wp[-11] = '0' | static_cast((newval >> 10) & 1); //FALLTHRU - case 10: wp[-10] = '0' | static_cast((newval >> 9) & 1); //FALLTHRU - case 9: wp[ -9] = '0' | static_cast((newval >> 8) & 1); //FALLTHRU - case 8: wp[ -8] = '0' | static_cast((newval >> 7) & 1); //FALLTHRU - case 7: wp[ -7] = '0' | static_cast((newval >> 6) & 1); //FALLTHRU - case 6: wp[ -6] = '0' | static_cast((newval >> 5) & 1); //FALLTHRU - case 5: wp[ -5] = '0' | static_cast((newval >> 4) & 1); //FALLTHRU - case 4: wp[ -4] = '0' | static_cast((newval >> 3) & 1); //FALLTHRU - case 3: wp[ -3] = '0' | static_cast((newval >> 2) & 1); //FALLTHRU - case 2: wp[ -2] = '0' | static_cast((newval >> 1) & 1); //FALLTHRU - /*bit*/ wp[ -1] = '0' | static_cast((newval ) & 1); //FALLTHRU - } - // clang-format on - finishLine(code, wp); + cvtCDataToStr(wp, newval << (VL_BYTESIZE - bits)); + finishLine(code, wp + bits); } VL_ATTR_ALWINLINE -void VerilatedVcd::emitQuad(vluint32_t code, vluint64_t newval, int bits) { - VL_PREFETCH_RD(VL_VCD_SUFFIXP(code)); +void VerilatedVcd::emitSData(vluint32_t code, SData newval, int bits) { char* wp = m_writep; *wp++ = 'b'; - // Handle the top 32 bits within the 64 bit input - const int bitsInTopHalf = bits - 32; - wp += bitsInTopHalf; - // clang-format off - switch (bitsInTopHalf) { - case 32: wp[-32] = '0' | static_cast((newval >> 63) ); //FALLTHRU - case 31: wp[-31] = '0' | static_cast((newval >> 62) & 1); //FALLTHRU - case 30: wp[-30] = '0' | static_cast((newval >> 61) & 1); //FALLTHRU - case 29: wp[-29] = '0' | static_cast((newval >> 60) & 1); //FALLTHRU - case 28: wp[-28] = '0' | static_cast((newval >> 59) & 1); //FALLTHRU - case 27: wp[-27] = '0' | static_cast((newval >> 58) & 1); //FALLTHRU - case 26: wp[-26] = '0' | static_cast((newval >> 57) & 1); //FALLTHRU - case 25: wp[-25] = '0' | static_cast((newval >> 56) & 1); //FALLTHRU - case 24: wp[-24] = '0' | static_cast((newval >> 55) & 1); //FALLTHRU - case 23: wp[-23] = '0' | static_cast((newval >> 54) & 1); //FALLTHRU - case 22: wp[-22] = '0' | static_cast((newval >> 53) & 1); //FALLTHRU - case 21: wp[-21] = '0' | static_cast((newval >> 52) & 1); //FALLTHRU - case 20: wp[-20] = '0' | static_cast((newval >> 51) & 1); //FALLTHRU - case 19: wp[-19] = '0' | static_cast((newval >> 50) & 1); //FALLTHRU - case 18: wp[-18] = '0' | static_cast((newval >> 49) & 1); //FALLTHRU - case 17: wp[-17] = '0' | static_cast((newval >> 48) & 1); //FALLTHRU - case 16: wp[-16] = '0' | static_cast((newval >> 47) & 1); //FALLTHRU - case 15: wp[-15] = '0' | static_cast((newval >> 46) & 1); //FALLTHRU - case 14: wp[-14] = '0' | static_cast((newval >> 45) & 1); //FALLTHRU - case 13: wp[-13] = '0' | static_cast((newval >> 44) & 1); //FALLTHRU - case 12: wp[-12] = '0' | static_cast((newval >> 43) & 1); //FALLTHRU - case 11: wp[-11] = '0' | static_cast((newval >> 42) & 1); //FALLTHRU - case 10: wp[-10] = '0' | static_cast((newval >> 41) & 1); //FALLTHRU - case 9: wp[ -9] = '0' | static_cast((newval >> 40) & 1); //FALLTHRU - case 8: wp[ -8] = '0' | static_cast((newval >> 39) & 1); //FALLTHRU - case 7: wp[ -7] = '0' | static_cast((newval >> 38) & 1); //FALLTHRU - case 6: wp[ -6] = '0' | static_cast((newval >> 37) & 1); //FALLTHRU - case 5: wp[ -5] = '0' | static_cast((newval >> 36) & 1); //FALLTHRU - case 4: wp[ -4] = '0' | static_cast((newval >> 35) & 1); //FALLTHRU - case 3: wp[ -3] = '0' | static_cast((newval >> 34) & 1); //FALLTHRU - case 2: wp[ -2] = '0' | static_cast((newval >> 33) & 1); //FALLTHRU - case 1: wp[ -1] = '0' | static_cast((newval >> 32) & 1); //FALLTHRU - } - // clang-format on - // Handle the bottom 32 bits within the 64 bit input - vluint32_t val = static_cast(newval); // Truncate to bottom 32 bits - int loops = 4; - do { - wp[0] = '0' | static_cast((val >> 31)); - wp[1] = '0' | static_cast((val >> 30) & 1); - wp[2] = '0' | static_cast((val >> 29) & 1); - wp[3] = '0' | static_cast((val >> 28) & 1); - wp[4] = '0' | static_cast((val >> 27) & 1); - wp[5] = '0' | static_cast((val >> 26) & 1); - wp[6] = '0' | static_cast((val >> 25) & 1); - wp[7] = '0' | static_cast((val >> 24) & 1); - wp += 8; - val <<= 8; - } while (--loops); - - finishLine(code, wp); + cvtSDataToStr(wp, newval << (VL_SHORTSIZE - bits)); + finishLine(code, wp + bits); } VL_ATTR_ALWINLINE -void VerilatedVcd::emitArray(vluint32_t code, const vluint32_t* newvalp, int bits) { - VL_PREFETCH_RD(VL_VCD_SUFFIXP(code)); - int words = (bits + 31) / 32; +void VerilatedVcd::emitIData(vluint32_t code, IData newval, int bits) { + char* wp = m_writep; + *wp++ = 'b'; + cvtIDataToStr(wp, newval << (VL_IDATASIZE - bits)); + finishLine(code, wp + bits); +} + +VL_ATTR_ALWINLINE +void VerilatedVcd::emitQData(vluint32_t code, QData newval, int bits) { + char* wp = m_writep; + *wp++ = 'b'; + cvtQDataToStr(wp, newval << (VL_QUADSIZE - bits)); + finishLine(code, wp + bits); +} + +VL_ATTR_ALWINLINE +void VerilatedVcd::emitWData(vluint32_t code, const WData* newvalp, int bits) { + int words = VL_WORDS_I(bits); char* wp = m_writep; *wp++ = 'b'; // Handle the most significant word - vluint32_t val = newvalp[--words]; - const int bitsInMSW = bits % 32 == 0 ? 32 : bits % 32; + const int bitsInMSW = VL_BITBIT_E(bits) ? VL_BITBIT_E(bits) : VL_EDATASIZE; + cvtEDataToStr(wp, newvalp[--words] << (VL_EDATASIZE - bitsInMSW)); wp += bitsInMSW; - // clang-format off - switch (bitsInMSW) { - case 32: wp[-32] = '0' | static_cast((val >> 31) ); //FALLTHRU - case 31: wp[-31] = '0' | static_cast((val >> 30) & 1); //FALLTHRU - case 30: wp[-30] = '0' | static_cast((val >> 29) & 1); //FALLTHRU - case 29: wp[-29] = '0' | static_cast((val >> 28) & 1); //FALLTHRU - case 28: wp[-28] = '0' | static_cast((val >> 27) & 1); //FALLTHRU - case 27: wp[-27] = '0' | static_cast((val >> 26) & 1); //FALLTHRU - case 26: wp[-26] = '0' | static_cast((val >> 25) & 1); //FALLTHRU - case 25: wp[-25] = '0' | static_cast((val >> 24) & 1); //FALLTHRU - case 24: wp[-24] = '0' | static_cast((val >> 23) & 1); //FALLTHRU - case 23: wp[-23] = '0' | static_cast((val >> 22) & 1); //FALLTHRU - case 22: wp[-22] = '0' | static_cast((val >> 21) & 1); //FALLTHRU - case 21: wp[-21] = '0' | static_cast((val >> 20) & 1); //FALLTHRU - case 20: wp[-20] = '0' | static_cast((val >> 19) & 1); //FALLTHRU - case 19: wp[-19] = '0' | static_cast((val >> 18) & 1); //FALLTHRU - case 18: wp[-18] = '0' | static_cast((val >> 17) & 1); //FALLTHRU - case 17: wp[-17] = '0' | static_cast((val >> 16) & 1); //FALLTHRU - case 16: wp[-16] = '0' | static_cast((val >> 15) & 1); //FALLTHRU - case 15: wp[-15] = '0' | static_cast((val >> 14) & 1); //FALLTHRU - case 14: wp[-14] = '0' | static_cast((val >> 13) & 1); //FALLTHRU - case 13: wp[-13] = '0' | static_cast((val >> 12) & 1); //FALLTHRU - case 12: wp[-12] = '0' | static_cast((val >> 11) & 1); //FALLTHRU - case 11: wp[-11] = '0' | static_cast((val >> 10) & 1); //FALLTHRU - case 10: wp[-10] = '0' | static_cast((val >> 9) & 1); //FALLTHRU - case 9: wp[ -9] = '0' | static_cast((val >> 8) & 1); //FALLTHRU - case 8: wp[ -8] = '0' | static_cast((val >> 7) & 1); //FALLTHRU - case 7: wp[ -7] = '0' | static_cast((val >> 6) & 1); //FALLTHRU - case 6: wp[ -6] = '0' | static_cast((val >> 5) & 1); //FALLTHRU - case 5: wp[ -5] = '0' | static_cast((val >> 4) & 1); //FALLTHRU - case 4: wp[ -4] = '0' | static_cast((val >> 3) & 1); //FALLTHRU - case 3: wp[ -3] = '0' | static_cast((val >> 2) & 1); //FALLTHRU - case 2: wp[ -2] = '0' | static_cast((val >> 1) & 1); //FALLTHRU - case 1: wp[ -1] = '0' | static_cast((val ) & 1); //FALLTHRU - } - // clang-format on // Handle the remaining words while (words > 0) { - vluint32_t val = newvalp[--words]; - int loops = 4; - do { - wp[0] = '0' | static_cast((val >> 31)); - wp[1] = '0' | static_cast((val >> 30) & 1); - wp[2] = '0' | static_cast((val >> 29) & 1); - wp[3] = '0' | static_cast((val >> 28) & 1); - wp[4] = '0' | static_cast((val >> 27) & 1); - wp[5] = '0' | static_cast((val >> 26) & 1); - wp[6] = '0' | static_cast((val >> 25) & 1); - wp[7] = '0' | static_cast((val >> 24) & 1); - wp += 8; - val <<= 8; - } while (--loops); + cvtEDataToStr(wp, newvalp[--words]); + wp += VL_EDATASIZE; } finishLine(code, wp); } VL_ATTR_ALWINLINE void VerilatedVcd::emitFloat(vluint32_t code, float newval) { - VL_PREFETCH_RD(VL_VCD_SUFFIXP(code)); char* wp = m_writep; // Buffer can't overflow before sprintf; we sized during declaration sprintf(wp, "r%.16g", static_cast(newval)); @@ -837,7 +710,6 @@ void VerilatedVcd::emitFloat(vluint32_t code, float newval) { VL_ATTR_ALWINLINE void VerilatedVcd::emitDouble(vluint32_t code, double newval) { - VL_PREFETCH_RD(VL_VCD_SUFFIXP(code)); char* wp = m_writep; // Buffer can't overflow before sprintf; we sized during declaration sprintf(wp, "r%.16g", newval); @@ -845,8 +717,6 @@ void VerilatedVcd::emitDouble(vluint32_t code, double newval) { finishLine(code, wp); } -#undef VL_VCD_SUFFIXP - #ifdef VL_TRACE_VCD_OLD_API void VerilatedVcd::fullBit(vluint32_t code, const vluint32_t newval) { diff --git a/include/verilated_vcd_c.h b/include/verilated_vcd_c.h index d9049366f..972c8c0e7 100644 --- a/include/verilated_vcd_c.h +++ b/include/verilated_vcd_c.h @@ -124,10 +124,12 @@ protected: // Implementations of duck-typed methods for VerilatedTrace. These are // called from only one place (namely full*) so always inline them. - inline void emitBit(vluint32_t code, vluint32_t newval); - inline void emitBus(vluint32_t code, vluint32_t newval, int bits); - inline void emitQuad(vluint32_t code, vluint64_t newval, int bits); - inline void emitArray(vluint32_t code, const vluint32_t* newvalp, int bits); + inline void emitBit(vluint32_t code, CData newval); + inline void emitCData(vluint32_t code, CData newval, int bits); + inline void emitSData(vluint32_t code, SData newval, int bits); + inline void emitIData(vluint32_t code, IData newval, int bits); + inline void emitQData(vluint32_t code, QData newval, int bits); + inline void emitWData(vluint32_t code, const WData* newvalp, int bits); inline void emitFloat(vluint32_t code, float newval); inline void emitDouble(vluint32_t code, double newval); @@ -176,37 +178,48 @@ public: int lsb); void declTriArray(vluint32_t code, const char* name, bool array, int arraynum, int msb, int lsb); - //========================================================================= - // Write back to previous value buffer value and emit - void fullBit(vluint32_t* oldp, vluint32_t newval) { fullBit(oldp - this->oldp(0), newval); } - void fullBus(vluint32_t* oldp, vluint32_t newval, int bits) { + void fullBit(vluint32_t* oldp, CData newval) { fullBit(oldp - this->oldp(0), newval); } + void fullCData(vluint32_t* oldp, CData newval, int bits) { fullBus(oldp - this->oldp(0), newval, bits); } - void fullQuad(vluint32_t* oldp, vluint64_t newval, int bits) { + void fullSData(vluint32_t* oldp, SData newval, int bits) { + fullBus(oldp - this->oldp(0), newval, bits); + } + void fullIData(vluint32_t* oldp, IData newval, int bits) { + fullBus(oldp - this->oldp(0), newval, bits); + } + void fullQData(vluint32_t* oldp, QData newval, int bits) { fullQuad(oldp - this->oldp(0), newval, bits); } - void fullArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) { + void fullWData(vluint32_t* oldp, const WData* newvalp, int bits) { fullArray(oldp - this->oldp(0), newvalp, bits); } void fullFloat(vluint32_t* oldp, float newval) { fullFloat(oldp - this->oldp(0), newval); } void fullDouble(vluint32_t* oldp, double newval) { fullDouble(oldp - this->oldp(0), newval); } - //========================================================================= - // Check previous value and emit if changed - - void chgBit(vluint32_t* oldp, vluint32_t newval) { chgBit(oldp - this->oldp(0), newval); } - void chgBus(vluint32_t* oldp, vluint32_t newval, int bits) { + inline void chgBit(vluint32_t* oldp, CData newval) { chgBit(oldp - this->oldp(0), newval); } + inline void chgCData(vluint32_t* oldp, CData newval, int bits) { chgBus(oldp - this->oldp(0), newval, bits); } - void chgQuad(vluint32_t* oldp, vluint64_t newval, int bits) { + inline void chgSData(vluint32_t* oldp, SData newval, int bits) { + chgBus(oldp - this->oldp(0), newval, bits); + } + inline void chgIData(vluint32_t* oldp, IData newval, int bits) { + chgBus(oldp - this->oldp(0), newval, bits); + } + inline void chgQData(vluint32_t* oldp, QData newval, int bits) { chgQuad(oldp - this->oldp(0), newval, bits); } - void chgArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) { + inline void chgWData(vluint32_t* oldp, const WData* newvalp, int bits) { chgArray(oldp - this->oldp(0), newvalp, bits); } - void chgFloat(vluint32_t* oldp, float newval) { chgFloat(oldp - this->oldp(0), newval); } - void chgDouble(vluint32_t* oldp, double newval) { chgDouble(oldp - this->oldp(0), newval); } + inline void chgFloat(vluint32_t* oldp, float newval) { + chgFloat(oldp - this->oldp(0), newval); + } + inline void chgDouble(vluint32_t* oldp, double newval) { + chgDouble(oldp - this->oldp(0), newval); + } /// Inside dumping routines, dump one signal, faster when not inlined /// due to code size reduction. diff --git a/include/verilatedos.h b/include/verilatedos.h index 8f689190b..352f7f87b 100644 --- a/include/verilatedos.h +++ b/include/verilatedos.h @@ -475,6 +475,16 @@ typedef unsigned long long vluint64_t; ///< 64-bit unsigned type #else # define VL_STRCASECMP strcasecmp #endif + +//========================================================================= +// Macros controlling target specific optimizations + +// Define VL_PORTABLE_ONLY to disable all target specific optimizations +#ifndef VL_PORTABLE_ONLY +# ifdef __x86_64__ +# define VL_X86_64 1 +# endif +#endif // VL_PORTABLE_ONLY // clang-format on //========================================================================= diff --git a/src/V3EmitC.cpp b/src/V3EmitC.cpp index 061e0b1b3..d31156ccf 100644 --- a/src/V3EmitC.cpp +++ b/src/V3EmitC.cpp @@ -3550,20 +3550,23 @@ class EmitCTrace : EmitCStmts { const bool full = (m_funcp->funcType() == AstCFuncType::TRACE_FULL || m_funcp->funcType() == AstCFuncType::TRACE_FULL_SUB); const string func = full ? "full" : "chg"; - bool emitWidth = false; + bool emitWidth = true; if (nodep->dtypep()->basicp()->isDouble()) { puts("vcdp->" + func + "Double"); + emitWidth = false; } else if (nodep->isWide() || emitTraceIsScBv(nodep) || emitTraceIsScBigUint(nodep)) { - puts("vcdp->" + func + "Array"); - emitWidth = true; + puts("vcdp->" + func + "WData"); } else if (nodep->isQuad()) { - puts("vcdp->" + func + "Quad"); - emitWidth = true; + puts("vcdp->" + func + "QData"); + } else if (nodep->declp()->widthMin() > 16) { + puts("vcdp->" + func + "IData"); + } else if (nodep->declp()->widthMin() > 8) { + puts("vcdp->" + func + "SData"); } else if (nodep->declp()->widthMin() > 1) { - puts("vcdp->" + func + "Bus"); - emitWidth = true; + puts("vcdp->" + func + "CData"); } else { puts("vcdp->" + func + "Bit"); + emitWidth = false; } const uint32_t offset = (arrayindex < 0) ? 0 : (arrayindex * nodep->declp()->widthWords()); diff --git a/test_regress/t/t_trace_array_fst_portable.pl b/test_regress/t/t_trace_array_fst_portable.pl new file mode 100755 index 000000000..0b15dea3c --- /dev/null +++ b/test_regress/t/t_trace_array_fst_portable.pl @@ -0,0 +1,28 @@ +#!/usr/bin/perl +if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; } +# DESCRIPTION: Verilator: Verilog Test driver/expect definition +# +# Copyright 2003-2009 by Wilson Snyder. This program is free software; you +# can redistribute it and/or modify it under the terms of either the GNU +# Lesser General Public License Version 3 or the Perl Artistic License +# Version 2.0. +# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 + +scenarios(vlt => 1); + +top_filename("t/t_trace_array.v"); +$Self->{golden_filename} = "t/t_trace_array_fst.out"; + +compile( + verilator_flags2 => ['--cc --trace-fst --trace-structs', + '-CFLAGS -DVL_PORTABLE_ONLY'], + ); + +execute( + check_finished => 1, + ); + +fst_identical($Self->trace_filename, $Self->{golden_filename}); + +ok(1); +1; diff --git a/test_regress/t/t_trace_complex_portable.pl b/test_regress/t/t_trace_complex_portable.pl new file mode 100755 index 000000000..b168579a4 --- /dev/null +++ b/test_regress/t/t_trace_complex_portable.pl @@ -0,0 +1,38 @@ +#!/usr/bin/perl +if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; } +# DESCRIPTION: Verilator: Verilog Test driver/expect definition +# +# Copyright 2003-2009 by Wilson Snyder. This program is free software; you +# can redistribute it and/or modify it under the terms of either the GNU +# Lesser General Public License Version 3 or the Perl Artistic License +# Version 2.0. +# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 + +# Same test as t_trace_complex, but exercising the old VCD tracing API + +scenarios(vlt => 1); + +top_filename("t/t_trace_complex.v"); + +compile( + verilator_flags2 => ['--cc --trace -CFLAGS -DVL_PORTABLE_ONLY'], + ); + +execute( + check_finished => 1, + ); + +file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_strp /); +file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_strp_strp /); +file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp /); +file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_arrp /); +file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_strp /); +file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arru\(/); +file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arru\(/); +file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arrp\(/); +file_grep ("$Self->{obj_dir}/simx.vcd", qr/ v_arru_strp\(/); + +vcd_identical ("$Self->{obj_dir}/simx.vcd", "t/t_trace_complex.out"); + +ok(1); +1;