diff --git a/Changes b/Changes index a0eb2b1d1..8c9418c9f 100644 --- a/Changes +++ b/Changes @@ -11,6 +11,10 @@ contributors that suggested a given feature are shown in []. Thanks! Verilator 4.223 devel ========================== +**Major:** + +* VCD tracing is now parallelized with --threads (#3449). [Geza Lore, Shunyao CAD] + **Minor:** * Support compile time trace signal selection with tracing_on/off (#3323). [Shunyao CAD] diff --git a/bin/verilator b/bin/verilator index b1ee97e73..40be6ba0f 100755 --- a/bin/verilator +++ b/bin/verilator @@ -405,7 +405,7 @@ detailed descriptions of these arguments. --trace-max-width Maximum array depth for tracing --trace-params Enable tracing of parameters --trace-structs Enable tracing structure names - --trace-threads Enable waveform creation on separate threads + --trace-threads Enable FST waveform creation on separate threads --trace-underscore Enable tracing of _signals -U Undefine preprocessor define --unroll-count Tune maximum loop iterations diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst index 00e101ed0..6100dcd55 100644 --- a/docs/guide/exe_verilator.rst +++ b/docs/guide/exe_verilator.rst @@ -1041,7 +1041,8 @@ Summary: is not thread safe. With "--threads 1", the generated model is single threaded but may run in a multithreaded environment. With "--threads N", where N >= 2, the model is generated to run multithreaded on up to N - threads. See :ref:`Multithreading`. + threads. See :ref:`Multithreading`. This option also applies to + :vlopt:`--trace` (but not :vlopt:`--trace-fst`). .. option:: --threads-dpi all @@ -1119,7 +1120,8 @@ Summary: Having tracing compiled in may result in some small performance losses, even when tracing is not turned on during model execution. - See also :vlopt:`--trace-threads` option. + When using :vlopt:`--threads`, VCD tracing is parallelized, using the + same number of threads as passed to :vlopt:`--threads`. .. 
option:: --trace-coverage @@ -1173,12 +1175,12 @@ Summary: .. option:: --trace-threads *threads* Enable waveform tracing using separate threads. This is typically faster - in simulation runtime but uses more total compute. This option is - independent of, and works with, both :vlopt:`--trace` and - :vlopt:`--trace-fst`. Different trace formats can take advantage of - more trace threads to varying degrees. Currently VCD tracing can utilize - at most "--trace-threads 1", and FST tracing can utilize at most - "--trace-threads 2". This overrides :vlopt:`--no-threads` . + in simulation runtime but uses more total compute. This option only + applies to :vlopt:`--trace-fst`. FST tracing can utilize at most + "--trace-threads 2". This overrides :vlopt:`--no-threads`. + + This option is accepted, but has absolutely no effect with + :vlopt:`--trace`, which respects :vlopt:`--threads` instead. .. option:: --trace-underscore diff --git a/docs/guide/verilating.rst b/docs/guide/verilating.rst index f443ca298..2af18c1f0 100644 --- a/docs/guide/verilating.rst +++ b/docs/guide/verilating.rst @@ -221,9 +221,13 @@ model, it may be beneficial to performance to adjust the influences the partitioning of the model by adjusting the assumed execution time of DPI imports. -The :vlopt:`--trace-threads` options can be used to produce trace dumps -using multiple threads. If :vlopt:`--trace-threads` is set without -:vlopt:`--threads`, then :vlopt:`--trace-threads` will imply +When using :vlopt:`--trace` to perform VCD tracing, the VCD trace +construction is parallelized using the same number of threads as specified +with :vlopt:`--threads`, and is executed on the same thread pool as the model. + +The :vlopt:`--trace-threads` option can be used with :vlopt:`--trace-fst` +to offload FST tracing using multiple threads. 
If :vlopt:`--trace-threads` is +given without :vlopt:`--threads`, then :vlopt:`--trace-threads` will imply :vlopt:`--threads 1 <--threads>`, i.e.: the support libraries will be thread safe. @@ -231,12 +235,12 @@ With :vlopt:`--trace-threads 0 <--trace-threads>`, trace dumps are produced on the main thread. This again gives the highest single thread performance. With :vlopt:`--trace-threads {N} <--trace-threads>`, where N is at least 1, -N additional threads will be created and managed by the trace files (e.g.: -VerilatedVcdC or VerilatedFstC), to generate the trace dump. The main -thread will be released to proceed with execution as soon as possible, -though some blocking of the main thread is still necessary while capturing -the trace. Different trace formats can utilize a various number of -threads. See the :vlopt:`--trace-threads` option. +up to N additional threads will be created and managed by the trace files +(e.g.: VerilatedFstC), to offload construction of the trace dump. The main +thread will be released to proceed with execution as soon as possible, though +some blocking of the main thread is still necessary while capturing the +trace. FST tracing can utilize up to 2 offload threads, so there is no use +in setting :vlopt:`--trace-threads` higher than 2 at the moment. When running a multithreaded model, the default Linux task scheduler often works against the model, by assuming threads are short lived, and thus @@ -441,7 +445,7 @@ SystemC include directories and link to the SystemC libraries. .. describe:: TRACE_THREADS - Optional. Generated multi-threaded trace dumping, same as + Optional. Generated multi-threaded FST trace dumping, same as "--trace-threads". .. 
describe:: TOP_MODULE diff --git a/include/verilated.h b/include/verilated.h index 804d7363a..f9cf79601 100644 --- a/include/verilated.h +++ b/include/verilated.h @@ -147,7 +147,7 @@ extern uint32_t VL_THREAD_ID() VL_MT_SAFE; #if VL_THREADED -#define VL_LOCK_SPINS 50000 /// Number of times to spin for a mutex before relaxing +#define VL_LOCK_SPINS 50000 /// Number of times to spin for a mutex before yielding /// Mutex, wrapped to allow -fthread_safety checks class VL_CAPABILITY("mutex") VerilatedMutex final { diff --git a/include/verilated_fst_c.cpp b/include/verilated_fst_c.cpp index 3e1b27744..0bc1048cf 100644 --- a/include/verilated_fst_c.cpp +++ b/include/verilated_fst_c.cpp @@ -83,9 +83,11 @@ static_assert(static_cast(FST_ST_VCD_PROGRAM) == static_cast(VLT_TRACE //============================================================================= // Specialization of the generics for this trace format -#define VL_DERIVED_T VerilatedFst +#define VL_SUB_T VerilatedFst +#define VL_BUF_T VerilatedFstBuffer #include "verilated_trace_imp.h" -#undef VL_DERIVED_T +#undef VL_SUB_T +#undef VL_BUF_T //============================================================================= // VerilatedFst @@ -111,7 +113,7 @@ void VerilatedFst::open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex) { m_curScope.clear(); - VerilatedTrace::traceInit(); + Super::traceInit(); // Clear the scope stack auto it = m_curScope.begin(); @@ -133,14 +135,14 @@ void VerilatedFst::open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex) { void VerilatedFst::close() VL_MT_SAFE_EXCLUDES(m_mutex) { const VerilatedLockGuard lock{m_mutex}; - VerilatedTrace::closeBase(); + Super::closeBase(); fstWriterClose(m_fst); m_fst = nullptr; } void VerilatedFst::flush() VL_MT_SAFE_EXCLUDES(m_mutex) { const VerilatedLockGuard lock{m_mutex}; - VerilatedTrace::flushBase(); + Super::flushBase(); fstWriterFlushContext(m_fst); } @@ -162,7 +164,7 @@ void VerilatedFst::declare(uint32_t code, const char* name, int dtypenum, 
fstVar int lsb) { const int bits = ((msb > lsb) ? (msb - lsb) : (lsb - msb)) + 1; - const bool enabled = VerilatedTrace::declCode(code, name, bits, false); + const bool enabled = Super::declCode(code, name, bits, false); if (!enabled) return; std::string nameasstr = namePrefix() + name; @@ -245,18 +247,42 @@ void VerilatedFst::declDouble(uint32_t code, const char* name, int dtypenum, fst declare(code, name, dtypenum, vardir, vartype, array, arraynum, false, 63, 0); } +//============================================================================= +// Get/commit trace buffer + +VerilatedFstBuffer* VerilatedFst::getTraceBuffer() { return new VerilatedFstBuffer{*this}; } + +void VerilatedFst::commitTraceBuffer(VerilatedFstBuffer* bufp) { +#ifdef VL_TRACE_OFFLOAD + if (bufp->m_offloadBufferWritep) { + m_offloadBufferWritep = bufp->m_offloadBufferWritep; + return; // Buffer will be deleted by the offload thread + } +#endif + delete bufp; +} + +//============================================================================= +// VerilatedFstBuffer implementation + +VerilatedFstBuffer::VerilatedFstBuffer(VerilatedFst& owner) + : VerilatedTraceBuffer{owner} {} + +//============================================================================= +// Trace rendering primitives + // Note: emit* are only ever called from one place (full* in // verilated_trace_imp.h, which is included in this file at the top), // so always inline them. VL_ATTR_ALWINLINE -void VerilatedFst::emitBit(uint32_t code, CData newval) { +void VerilatedFstBuffer::emitBit(uint32_t code, CData newval) { VL_DEBUG_IFDEF(assert(m_symbolp[code]);); fstWriterEmitValueChange(m_fst, m_symbolp[code], newval ? 
"1" : "0"); } VL_ATTR_ALWINLINE -void VerilatedFst::emitCData(uint32_t code, CData newval, int bits) { +void VerilatedFstBuffer::emitCData(uint32_t code, CData newval, int bits) { char buf[VL_BYTESIZE]; VL_DEBUG_IFDEF(assert(m_symbolp[code]);); cvtCDataToStr(buf, newval << (VL_BYTESIZE - bits)); @@ -264,7 +290,7 @@ void VerilatedFst::emitCData(uint32_t code, CData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedFst::emitSData(uint32_t code, SData newval, int bits) { +void VerilatedFstBuffer::emitSData(uint32_t code, SData newval, int bits) { char buf[VL_SHORTSIZE]; VL_DEBUG_IFDEF(assert(m_symbolp[code]);); cvtSDataToStr(buf, newval << (VL_SHORTSIZE - bits)); @@ -272,7 +298,7 @@ void VerilatedFst::emitSData(uint32_t code, SData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedFst::emitIData(uint32_t code, IData newval, int bits) { +void VerilatedFstBuffer::emitIData(uint32_t code, IData newval, int bits) { char buf[VL_IDATASIZE]; VL_DEBUG_IFDEF(assert(m_symbolp[code]);); cvtIDataToStr(buf, newval << (VL_IDATASIZE - bits)); @@ -280,7 +306,7 @@ void VerilatedFst::emitIData(uint32_t code, IData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedFst::emitQData(uint32_t code, QData newval, int bits) { +void VerilatedFstBuffer::emitQData(uint32_t code, QData newval, int bits) { char buf[VL_QUADSIZE]; VL_DEBUG_IFDEF(assert(m_symbolp[code]);); cvtQDataToStr(buf, newval << (VL_QUADSIZE - bits)); @@ -288,7 +314,7 @@ void VerilatedFst::emitQData(uint32_t code, QData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedFst::emitWData(uint32_t code, const WData* newvalp, int bits) { +void VerilatedFstBuffer::emitWData(uint32_t code, const WData* newvalp, int bits) { int words = VL_WORDS_I(bits); char* wp = m_strbuf; // Convert the most significant word @@ -304,6 +330,6 @@ void VerilatedFst::emitWData(uint32_t code, const WData* newvalp, int bits) { } VL_ATTR_ALWINLINE -void VerilatedFst::emitDouble(uint32_t code, double newval) { +void 
VerilatedFstBuffer::emitDouble(uint32_t code, double newval) { fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval); } diff --git a/include/verilated_fst_c.h b/include/verilated_fst_c.h index b622a1894..5131cc8cc 100644 --- a/include/verilated_fst_c.h +++ b/include/verilated_fst_c.h @@ -31,15 +31,19 @@ #include #include +class VerilatedFstBuffer; + //============================================================================= // VerilatedFst // Base class to create a Verilator FST dump // This is an internally used class - see VerilatedFstC for what to call from applications -class VerilatedFst final : public VerilatedTrace { +class VerilatedFst final : public VerilatedTrace { +public: + using Super = VerilatedTrace; + private: - // Give the superclass access to private bits (to avoid virtual functions) - friend class VerilatedTrace; + friend Buffer; // Give the buffer access to the private bits //========================================================================= // FST specific internals @@ -60,31 +64,26 @@ protected: //========================================================================= // Implementation of VerilatedTrace interface - // Implementations of protected virtual methods for VerilatedTrace + // Called when the trace moves forward to a new time point virtual void emitTimeChange(uint64_t timeui) override; // Hooks called from VerilatedTrace virtual bool preFullDump() override { return isOpen(); } virtual bool preChangeDump() override { return isOpen(); } - // Implementations of duck-typed methods for VerilatedTrace. These are - // called from only one place (namely full*) so always inline them. 
- inline void emitBit(uint32_t code, CData newval); - inline void emitCData(uint32_t code, CData newval, int bits); - inline void emitSData(uint32_t code, SData newval, int bits); - inline void emitIData(uint32_t code, IData newval, int bits); - inline void emitQData(uint32_t code, QData newval, int bits); - inline void emitWData(uint32_t code, const WData* newvalp, int bits); - inline void emitDouble(uint32_t code, double newval); + // Trace buffer management + virtual VerilatedFstBuffer* getTraceBuffer() override; + virtual void commitTraceBuffer(VerilatedFstBuffer*) override; public: //========================================================================= // External interface to client code - // (All must be threadsafe) + // CONSTRUCTOR explicit VerilatedFst(void* fst = nullptr); ~VerilatedFst(); + // METHODS - All must be thread safe // Open the file; call isOpen() to see if errors void open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex); // Close the file @@ -97,11 +96,6 @@ public: //========================================================================= // Internal interface to Verilator generated code - // Inside dumping routines, declare a data type - void declDTypeEnum(int dtypenum, const char* name, uint32_t elements, unsigned int minValbits, - const char** itemNamesp, const char** itemValuesp); - - // Inside dumping routines, declare a signal void declBit(uint32_t code, const char* name, int dtypenum, fstVarDir vardir, fstVarType vartype, bool array, int arraynum); void declBus(uint32_t code, const char* name, int dtypenum, fstVarDir vardir, @@ -112,18 +106,55 @@ public: fstVarType vartype, bool array, int arraynum, int msb, int lsb); void declDouble(uint32_t code, const char* name, int dtypenum, fstVarDir vardir, fstVarType vartype, bool array, int arraynum); + + void declDTypeEnum(int dtypenum, const char* name, uint32_t elements, unsigned int minValbits, + const char** itemNamesp, const char** itemValuesp); }; #ifndef DOXYGEN // Declare 
specialization here as it's used in VerilatedFstC just below -template <> void VerilatedTrace::dump(uint64_t timeui); -template <> void VerilatedTrace::set_time_unit(const char* unitp); -template <> void VerilatedTrace::set_time_unit(const std::string& unit); -template <> void VerilatedTrace::set_time_resolution(const char* unitp); -template <> void VerilatedTrace::set_time_resolution(const std::string& unit); -template <> void VerilatedTrace::dumpvars(int level, const std::string& hier); +template <> void VerilatedFst::Super::dump(uint64_t time); +template <> void VerilatedFst::Super::set_time_unit(const char* unitp); +template <> void VerilatedFst::Super::set_time_unit(const std::string& unit); +template <> void VerilatedFst::Super::set_time_resolution(const char* unitp); +template <> void VerilatedFst::Super::set_time_resolution(const std::string& unit); +template <> void VerilatedFst::Super::dumpvars(int level, const std::string& hier); #endif +//============================================================================= +// VerilatedFstBuffer + +class VerilatedFstBuffer final : public VerilatedTraceBuffer { + // Give the trace file access to the private bits + friend VerilatedFst; + friend VerilatedFst::Super; + + // The FST file handle + void* const m_fst = m_owner.m_fst; + // code to fstHandle map, as an array + const fstHandle* const m_symbolp = m_owner.m_symbolp; + // String buffer long enough to hold maxBits() chars + char* const m_strbuf = m_owner.m_strbuf; + +public: + // CONSTRUCTOR + explicit VerilatedFstBuffer(VerilatedFst& owner); + ~VerilatedFstBuffer() = default; + + //========================================================================= + // Implementation of VerilatedTraceBuffer interface + + // Implementations of duck-typed methods for VerilatedTraceBuffer. These are + // called from only one place (the full* methods), so always inline them. 
+ VL_ATTR_ALWINLINE inline void emitBit(uint32_t code, CData newval); + VL_ATTR_ALWINLINE inline void emitCData(uint32_t code, CData newval, int bits); + VL_ATTR_ALWINLINE inline void emitSData(uint32_t code, SData newval, int bits); + VL_ATTR_ALWINLINE inline void emitIData(uint32_t code, IData newval, int bits); + VL_ATTR_ALWINLINE inline void emitQData(uint32_t code, QData newval, int bits); + VL_ATTR_ALWINLINE inline void emitWData(uint32_t code, const WData* newvalp, int bits); + VL_ATTR_ALWINLINE inline void emitDouble(uint32_t code, double newval); +}; + //============================================================================= // VerilatedFstC /// Create a FST dump file in C standalone (no SystemC) simulations. diff --git a/include/verilated_trace.h b/include/verilated_trace.h index 3174ff7c1..7915c3645 100644 --- a/include/verilated_trace.h +++ b/include/verilated_trace.h @@ -22,28 +22,43 @@ #ifndef VERILATOR_VERILATED_TRACE_H_ #define VERILATOR_VERILATED_TRACE_H_ -#ifdef VL_TRACE_THREADED -#define VL_TRACE_OFFLOAD +// clang-format off + +// In FST mode, VL_TRACE_THREADED enables offloading, but only if we also have +// the FST writer thread. 
This means with --trace-threads 1, we get the FST +// writer thread only, and with --trace-threads 2 we get offloading as well +#if defined(VL_TRACE_FST_WRITER_THREAD) && defined(VL_TRACE_THREADED) +# define VL_TRACE_OFFLOAD +#endif +// VCD tracing can happen fully in parallel +#if defined(VM_TRACE_VCD) && VM_TRACE_VCD && defined(VL_TRACE_THREADED) +# define VL_TRACE_PARALLEL #endif -// clang-format off +#if defined(VL_TRACE_PARALLEL) && defined(VL_TRACE_OFFLOAD) +# error "Cannot have VL_TRACE_PARALLEL and VL_TRACE_OFFLOAD together" +#endif #include "verilated.h" #include "verilated_trace_defs.h" #include +#include #include #include +#include #include #ifdef VL_TRACE_OFFLOAD -# include # include # include #endif // clang-format on +class VlThreadPool; +template class VerilatedTraceBuffer; + #ifdef VL_TRACE_OFFLOAD //============================================================================= // Offloaded tracing @@ -106,7 +121,8 @@ public: CHG_WDATA = 0x6, CHG_DOUBLE = 0x8, // TODO: full.. - TIME_CHANGE = 0xd, + TIME_CHANGE = 0xc, + TRACE_BUFFER = 0xd, END = 0xe, // End of buffer SHUTDOWN = 0xf // Shutdown worker thread, also marks end of buffer }; @@ -116,16 +132,22 @@ public: //============================================================================= // VerilatedTrace -// VerilatedTrace uses F-bounded polymorphism to access duck-typed -// implementations in the format specific derived class, which must be passed -// as the type parameter T_Derived -template class VerilatedTrace VL_NOT_FINAL { +// T_Trace is the format specific subclass of VerilatedTrace. +// T_Buffer is the format specific subclass of VerilatedTraceBuffer. 
+template class VerilatedTrace VL_NOT_FINAL { + // Give the buffer (both base and derived) access to the private bits + friend VerilatedTraceBuffer; + friend T_Buffer; + public: + using Buffer = T_Buffer; + //========================================================================= // Generic tracing internals - using initCb_t = void (*)(void*, T_Derived*, uint32_t); // Type of init callbacks - using dumpCb_t = void (*)(void*, T_Derived*); // Type of all but init callbacks + using initCb_t = void (*)(void*, T_Trace*, uint32_t); // Type of init callbacks + using dumpCb_t = void (*)(void*, Buffer*); // Type of dump callbacks + using cleanupCb_t = void (*)(void*, T_Trace*); // Type of cleanup callbacks private: struct CallbackRecord { @@ -133,9 +155,10 @@ private: // (the one in Ubuntu 14.04 with GCC 4.8.4 in particular) use the // assignment operator on inserting into collections, so they don't work // with const fields... - union { - initCb_t m_initCb; // The callback function - dumpCb_t m_dumpCb; // The callback function + union { // The callback + initCb_t m_initCb; + dumpCb_t m_dumpCb; + cleanupCb_t m_cleanupCb; }; void* m_userp; // The user pointer to pass to the callback (the symbol table) CallbackRecord(initCb_t cb, void* userp) @@ -144,16 +167,46 @@ private: CallbackRecord(dumpCb_t cb, void* userp) : m_dumpCb{cb} , m_userp{userp} {} + CallbackRecord(cleanupCb_t cb, void* userp) + : m_cleanupCb{cb} + , m_userp{userp} {} }; - uint32_t* m_sigs_oldvalp = nullptr; // Old value store +#ifdef VL_TRACE_PARALLEL + struct ParallelWorkerData { + const dumpCb_t m_cb; // The callback + void* const m_userp; // The user pointer to pass to the callback + Buffer* const m_bufp; // The buffer pointer to pass to the callback + std::atomic m_ready{false}; // The ready flag + mutable VerilatedMutex m_mutex; // Mutex for suspension until ready + std::condition_variable_any m_cv; // Condition variable for suspension + bool m_waiting VL_GUARDED_BY(m_mutex) = false; // Whether a thread 
is suspended in wait() + + void wait(); + + ParallelWorkerData(dumpCb_t cb, void* userp, Buffer* bufp) + : m_cb{cb} + , m_userp{userp} + , m_bufp{bufp} {} + }; + + // Passed a ParallelWorkerData*, second argument is ignored + static void parallelWorkerTask(void*, bool); +#endif + + using ParallelCallbackMap = std::unordered_map>; + +protected: + uint32_t* m_sigs_oldvalp = nullptr; // Previous value store EData* m_sigs_enabledp = nullptr; // Bit vector of enabled codes (nullptr = all on) +private: uint64_t m_timeLastDump = 0; // Last time we did a dump std::vector m_sigs_enabledVec; // Staging for m_sigs_enabledp - std::vector m_initCbs; // Routines to initialize traciong - std::vector m_fullCbs; // Routines to perform full dump - std::vector m_chgCbs; // Routines to perform incremental dump + std::vector m_initCbs; // Routines to initialize tracing + ParallelCallbackMap m_fullCbs; // Routines to perform full dump + ParallelCallbackMap m_chgCbs; // Routines to perform incremental dump std::vector m_cleanupCbs; // Routines to call at the end of dump + std::vector m_threadPoolps; // All thread pools, in insertion order bool m_fullDump = true; // Whether a full dump is required on the next call to 'dump' uint32_t m_nextCode = 0; // Next code number to assign uint32_t m_numSignals = 0; // Number of distinct signals @@ -164,12 +217,16 @@ private: double m_timeRes = 1e-9; // Time resolution (ns/ms etc) double m_timeUnit = 1e-0; // Time units (ns/ms etc) + void addThreadPool(VlThreadPool* threadPoolp) VL_MT_SAFE_EXCLUDES(m_mutex); + void addCallbackRecord(std::vector& cbVec, CallbackRecord& cbRec) VL_MT_SAFE_EXCLUDES(m_mutex); - // Equivalent to 'this' but is of the sub-type 'T_Derived*'. Use 'self()->' + // Equivalent to 'this' but is of the sub-type 'T_Trace*'. Use 'self()->' // to access duck-typed functions to avoid a virtual function call. 
- T_Derived* self() { return static_cast(this); } + T_Trace* self() { return static_cast(this); } + + void runParallelCallbacks(const ParallelCallbackMap& cbMap); // Flush any remaining data for this file static void onFlush(void* selfp) VL_MT_UNSAFE_ONE; @@ -185,10 +242,14 @@ private: VerilatedThreadQueue m_offloadBuffersToWorker; // Buffers returned from worker after processing VerilatedThreadQueue m_offloadBuffersFromWorker; + +protected: // Write pointer into current buffer uint32_t* m_offloadBufferWritep = nullptr; // End of offload buffer uint32_t* m_offloadBufferEndp = nullptr; + +private: // The offload worker thread itself std::unique_ptr m_workerThread; @@ -250,6 +311,10 @@ protected: virtual bool preFullDump() = 0; virtual bool preChangeDump() = 0; + // Trace buffer management + virtual Buffer* getTraceBuffer() = 0; + virtual void commitTraceBuffer(Buffer*) = 0; + public: //========================================================================= // External interface to client code @@ -270,19 +335,55 @@ public: // Call void dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex); + //========================================================================= + // Internal interface to Verilator generated code + //========================================================================= // Non-hot path internal interface to Verilator generated code void addInitCb(initCb_t cb, void* userp) VL_MT_SAFE; - void addFullCb(dumpCb_t cb, void* userp) VL_MT_SAFE; - void addChgCb(dumpCb_t cb, void* userp) VL_MT_SAFE; - void addCleanupCb(dumpCb_t cb, void* userp) VL_MT_SAFE; + void addFullCb(dumpCb_t cb, void* userp, VlThreadPool* = nullptr) VL_MT_SAFE; + void addChgCb(dumpCb_t cb, void* userp, VlThreadPool* = nullptr) VL_MT_SAFE; + void addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE; void scopeEscape(char flag) { m_scopeEscape = flag; } void pushNamePrefix(const std::string&); void popNamePrefix(unsigned count = 1); +}; 
+//============================================================================= +// VerilatedTraceBuffer + +// T_Trace is the format specific subclass of VerilatedTrace. +// T_Buffer is the format specific subclass of VerilatedTraceBuffer. +// The format-specific hot-path methods use duck-typing via T_Buffer for performance. +template class VerilatedTraceBuffer VL_NOT_FINAL { + friend T_Trace; // Give the trace file access to the private bits + +protected: + T_Trace& m_owner; // The VerilatedTrace subclass that owns this buffer + + // Previous value store + uint32_t* const m_sigs_oldvalp = m_owner.m_sigs_oldvalp; + // Bit vector of enabled codes (nullptr = all on) + EData* const m_sigs_enabledp = m_owner.m_sigs_enabledp; + +#ifdef VL_TRACE_OFFLOAD + // Write pointer into current buffer + uint32_t* m_offloadBufferWritep = m_owner.m_offloadBufferWritep; + // End of offload buffer + uint32_t* const m_offloadBufferEndp = m_owner.m_offloadBufferEndp; +#endif + + // Equivalent to 'this' but is of the sub-type 'T_Buffer*'. Use 'self()->' + // to access duck-typed functions to avoid a virtual function call. + inline T_Buffer* self() { return static_cast(this); } + + explicit VerilatedTraceBuffer(T_Trace& owner); + virtual ~VerilatedTraceBuffer() = default; + +public: //========================================================================= // Hot path internal interface to Verilator generated code @@ -363,9 +464,13 @@ public: VL_DEBUG_IF(assert(m_offloadBufferWritep <= m_offloadBufferEndp);); } -#define CHG(name) chg##name##Impl -#else -#define CHG(name) chg##name +#define chgBit chgBitImpl +#define chgCData chgCDataImpl +#define chgSData chgSDataImpl +#define chgIData chgIDataImpl +#define chgQData chgQDataImpl +#define chgWData chgWDataImpl +#define chgDouble chgDoubleImpl #endif // In non-offload mode, these are called directly by the trace callbacks, @@ -373,27 +478,27 @@ public: // thread and are called chg*Impl // Check previous dumped value of signal. 
If changed, then emit trace entry - VL_ATTR_ALWINLINE inline void CHG(Bit)(uint32_t* oldp, CData newval) { + VL_ATTR_ALWINLINE inline void chgBit(uint32_t* oldp, CData newval) { const uint32_t diff = *oldp ^ newval; if (VL_UNLIKELY(diff)) fullBit(oldp, newval); } - VL_ATTR_ALWINLINE inline void CHG(CData)(uint32_t* oldp, CData newval, int bits) { + VL_ATTR_ALWINLINE inline void chgCData(uint32_t* oldp, CData newval, int bits) { const uint32_t diff = *oldp ^ newval; if (VL_UNLIKELY(diff)) fullCData(oldp, newval, bits); } - VL_ATTR_ALWINLINE inline void CHG(SData)(uint32_t* oldp, SData newval, int bits) { + VL_ATTR_ALWINLINE inline void chgSData(uint32_t* oldp, SData newval, int bits) { const uint32_t diff = *oldp ^ newval; if (VL_UNLIKELY(diff)) fullSData(oldp, newval, bits); } - VL_ATTR_ALWINLINE inline void CHG(IData)(uint32_t* oldp, IData newval, int bits) { + VL_ATTR_ALWINLINE inline void chgIData(uint32_t* oldp, IData newval, int bits) { const uint32_t diff = *oldp ^ newval; if (VL_UNLIKELY(diff)) fullIData(oldp, newval, bits); } - VL_ATTR_ALWINLINE inline void CHG(QData)(uint32_t* oldp, QData newval, int bits) { + VL_ATTR_ALWINLINE inline void chgQData(uint32_t* oldp, QData newval, int bits) { const uint64_t diff = *reinterpret_cast(oldp) ^ newval; if (VL_UNLIKELY(diff)) fullQData(oldp, newval, bits); } - inline void CHG(WData)(uint32_t* oldp, const WData* newvalp, int bits) { + VL_ATTR_ALWINLINE inline void chgWData(uint32_t* oldp, const WData* newvalp, int bits) { for (int i = 0; i < (bits + 31) / 32; ++i) { if (VL_UNLIKELY(oldp[i] ^ newvalp[i])) { fullWData(oldp, newvalp, bits); @@ -401,11 +506,20 @@ public: } } } - VL_ATTR_ALWINLINE inline void CHG(Double)(uint32_t* oldp, double newval) { + VL_ATTR_ALWINLINE inline void chgDouble(uint32_t* oldp, double newval) { // cppcheck-suppress invalidPointerCast if (VL_UNLIKELY(*reinterpret_cast(oldp) != newval)) fullDouble(oldp, newval); } -#undef CHG +#ifdef VL_TRACE_OFFLOAD +#undef chgBit +#undef chgCData +#undef 
chgSData +#undef chgIData +#undef chgQData +#undef chgWData +#undef chgDouble +#endif }; + #endif // guard diff --git a/include/verilated_trace_imp.h b/include/verilated_trace_imp.h index e62e40cab..d2ffa965c 100644 --- a/include/verilated_trace_imp.h +++ b/include/verilated_trace_imp.h @@ -20,12 +20,16 @@ // clang-format off #ifndef VL_CPPCHECK -#ifndef VL_DERIVED_T +#if !defined(VL_SUB_T) || !defined(VL_BUF_T) # error "This file should be included in trace format implementations" #endif #include "verilated_intrinsics.h" #include "verilated_trace.h" +#ifdef VL_TRACE_PARALLEL +# include "verilated_threads.h" +# include +#endif #if 0 # include @@ -78,7 +82,7 @@ static std::string doubleToTimescale(double value) { //========================================================================= // Buffer management -template <> uint32_t* VerilatedTrace::getOffloadBuffer() { +template <> uint32_t* VerilatedTrace::getOffloadBuffer() { uint32_t* bufferp; // Some jitter is expected, so some number of alternative offlaod buffers are // required, but don't allocate more than 8 buffers. @@ -97,7 +101,7 @@ template <> uint32_t* VerilatedTrace::getOffloadBuffer() { return bufferp; } -template <> void VerilatedTrace::waitForOffloadBuffer(const uint32_t* buffp) { +template <> void VerilatedTrace::waitForOffloadBuffer(const uint32_t* buffp) { // Slow path code only called on flush/shutdown, so use a simple algorithm. // Collect buffers from worker and stash them until we get the one we want. 
std::deque stash; @@ -112,7 +116,7 @@ template <> void VerilatedTrace::waitForOffloadBuffer(const uint32 //========================================================================= // Worker thread -template <> void VerilatedTrace::offloadWorkerThreadMain() { +template <> void VerilatedTrace::offloadWorkerThreadMain() { bool shutdown = false; do { @@ -123,6 +127,8 @@ template <> void VerilatedTrace::offloadWorkerThreadMain() { const uint32_t* readp = bufferp; + std::unique_ptr traceBufp; // We own the passed tracebuffer + while (true) { const uint32_t cmd = readp[0]; const uint32_t top = cmd >> 4; @@ -137,44 +143,44 @@ template <> void VerilatedTrace::offloadWorkerThreadMain() { // CHG_* commands case VerilatedTraceOffloadCommand::CHG_BIT_0: VL_TRACE_OFFLOAD_DEBUG("Command CHG_BIT_0 " << top); - chgBitImpl(oldp, 0); + traceBufp->chgBitImpl(oldp, 0); continue; case VerilatedTraceOffloadCommand::CHG_BIT_1: VL_TRACE_OFFLOAD_DEBUG("Command CHG_BIT_1 " << top); - chgBitImpl(oldp, 1); + traceBufp->chgBitImpl(oldp, 1); continue; case VerilatedTraceOffloadCommand::CHG_CDATA: VL_TRACE_OFFLOAD_DEBUG("Command CHG_CDATA " << top); // Bits stored in bottom byte of command - chgCDataImpl(oldp, *readp, top); + traceBufp->chgCDataImpl(oldp, *readp, top); readp += 1; continue; case VerilatedTraceOffloadCommand::CHG_SDATA: VL_TRACE_OFFLOAD_DEBUG("Command CHG_SDATA " << top); // Bits stored in bottom byte of command - chgSDataImpl(oldp, *readp, top); + traceBufp->chgSDataImpl(oldp, *readp, top); readp += 1; continue; case VerilatedTraceOffloadCommand::CHG_IDATA: VL_TRACE_OFFLOAD_DEBUG("Command CHG_IDATA " << top); // Bits stored in bottom byte of command - chgIDataImpl(oldp, *readp, top); + traceBufp->chgIDataImpl(oldp, *readp, top); readp += 1; continue; case VerilatedTraceOffloadCommand::CHG_QDATA: VL_TRACE_OFFLOAD_DEBUG("Command CHG_QDATA " << top); // Bits stored in bottom byte of command - chgQDataImpl(oldp, *reinterpret_cast(readp), top); + traceBufp->chgQDataImpl(oldp, 
*reinterpret_cast(readp), top); readp += 2; continue; case VerilatedTraceOffloadCommand::CHG_WDATA: VL_TRACE_OFFLOAD_DEBUG("Command CHG_WDATA " << top); - chgWDataImpl(oldp, readp, top); + traceBufp->chgWDataImpl(oldp, readp, top); readp += VL_WORDS_I(top); continue; case VerilatedTraceOffloadCommand::CHG_DOUBLE: VL_TRACE_OFFLOAD_DEBUG("Command CHG_DOUBLE " << top); - chgDoubleImpl(oldp, *reinterpret_cast(readp)); + traceBufp->chgDoubleImpl(oldp, *reinterpret_cast(readp)); readp += 2; continue; @@ -187,9 +193,18 @@ template <> void VerilatedTrace::offloadWorkerThreadMain() { readp += 2; continue; + case VerilatedTraceOffloadCommand::TRACE_BUFFER: + VL_TRACE_OFFLOAD_DEBUG("Command TRACE_BUFFER " << top); + readp -= 1; // No code in this command, undo increment + traceBufp.reset(*reinterpret_cast(readp)); + readp += 2; + continue; + //=== // Commands ending this buffer - case VerilatedTraceOffloadCommand::END: VL_TRACE_OFFLOAD_DEBUG("Command END"); break; + case VerilatedTraceOffloadCommand::END: // + VL_TRACE_OFFLOAD_DEBUG("Command END"); + break; case VerilatedTraceOffloadCommand::SHUTDOWN: VL_TRACE_OFFLOAD_DEBUG("Command SHUTDOWN"); shutdown = true; @@ -198,8 +213,7 @@ template <> void VerilatedTrace::offloadWorkerThreadMain() { //=== // Unknown command default: { // LCOV_EXCL_START - VL_TRACE_OFFLOAD_DEBUG("Command UNKNOWN"); - VL_PRINTF_MT("Trace command: 0x%08x\n", cmd); + VL_TRACE_OFFLOAD_DEBUG("Command UNKNOWN " << cmd); VL_FATAL_MT(__FILE__, __LINE__, "", "Unknown trace command"); break; } // LCOV_EXCL_STOP @@ -217,7 +231,7 @@ template <> void VerilatedTrace::offloadWorkerThreadMain() { } while (VL_LIKELY(!shutdown)); } -template <> void VerilatedTrace::shutdownOffloadWorker() { +template <> void VerilatedTrace::shutdownOffloadWorker() { // If the worker thread is not running, done.. 
if (!m_workerThread) return; @@ -237,7 +251,7 @@ template <> void VerilatedTrace::shutdownOffloadWorker() { //============================================================================= // Life cycle -template <> void VerilatedTrace::closeBase() { +template <> void VerilatedTrace::closeBase() { #ifdef VL_TRACE_OFFLOAD shutdownOffloadWorker(); while (m_numOffloadBuffers) { @@ -247,7 +261,7 @@ template <> void VerilatedTrace::closeBase() { #endif } -template <> void VerilatedTrace::flushBase() { +template <> void VerilatedTrace::flushBase() { #ifdef VL_TRACE_OFFLOAD // Hand an empty buffer to the worker thread uint32_t* const bufferp = getOffloadBuffer(); @@ -262,29 +276,29 @@ template <> void VerilatedTrace::flushBase() { //============================================================================= // Callbacks to run on global events -template <> void VerilatedTrace::onFlush(void* selfp) { +template <> void VerilatedTrace::onFlush(void* selfp) { // This calls 'flush' on the derived class (which must then get any mutex) - reinterpret_cast(selfp)->flush(); + reinterpret_cast(selfp)->flush(); } -template <> void VerilatedTrace::onExit(void* selfp) { +template <> void VerilatedTrace::onExit(void* selfp) { // This calls 'close' on the derived class (which must then get any mutex) - reinterpret_cast(selfp)->close(); + reinterpret_cast(selfp)->close(); } //============================================================================= // VerilatedTrace -template <> VerilatedTrace::VerilatedTrace() { +template <> VerilatedTrace::VerilatedTrace() { set_time_unit(Verilated::threadContextp()->timeunitString()); set_time_resolution(Verilated::threadContextp()->timeprecisionString()); } -template <> VerilatedTrace::~VerilatedTrace() { +template <> VerilatedTrace::~VerilatedTrace() { if (m_sigs_oldvalp) VL_DO_CLEAR(delete[] m_sigs_oldvalp, m_sigs_oldvalp = nullptr); if (m_sigs_enabledp) VL_DO_CLEAR(delete[] m_sigs_enabledp, m_sigs_enabledp = nullptr); - 
Verilated::removeFlushCb(VerilatedTrace::onFlush, this); - Verilated::removeExitCb(VerilatedTrace::onExit, this); + Verilated::removeFlushCb(VerilatedTrace::onFlush, this); + Verilated::removeExitCb(VerilatedTrace::onExit, this); #ifdef VL_TRACE_OFFLOAD closeBase(); #endif @@ -293,7 +307,7 @@ template <> VerilatedTrace::~VerilatedTrace() { //========================================================================= // Internals available to format specific implementations -template <> void VerilatedTrace::traceInit() VL_MT_UNSAFE { +template <> void VerilatedTrace::traceInit() VL_MT_UNSAFE { // Note: It is possible to re-open a trace file (VCD in particular), // so we must reset the next code here, but it must have the same number // of codes on re-open @@ -338,8 +352,8 @@ template <> void VerilatedTrace::traceInit() VL_MT_UNSAFE { } // Set callback so flush/abort will flush this file - Verilated::addFlushCb(VerilatedTrace::onFlush, this); - Verilated::addExitCb(VerilatedTrace::onExit, this); + Verilated::addFlushCb(VerilatedTrace::onFlush, this); + Verilated::addExitCb(VerilatedTrace::onExit, this); #ifdef VL_TRACE_OFFLOAD // Compute offload buffer size. 
we need to be able to store a new value for @@ -351,13 +365,13 @@ template <> void VerilatedTrace::traceInit() VL_MT_UNSAFE { // Start the worker thread m_workerThread.reset( - new std::thread{&VerilatedTrace::offloadWorkerThreadMain, this}); + new std::thread{&VerilatedTrace::offloadWorkerThreadMain, this}); #endif } template <> -bool VerilatedTrace::declCode(uint32_t code, const char* namep, uint32_t bits, - bool tri) { +bool VerilatedTrace::declCode(uint32_t code, const char* namep, uint32_t bits, + bool tri) { if (VL_UNCOVERABLE(!code)) { VL_FATAL_MT(__FILE__, __LINE__, "", "Internal: internal trace problem, code 0 is illegal"); } @@ -401,28 +415,30 @@ bool VerilatedTrace::declCode(uint32_t code, const char* namep, ui //========================================================================= // Internals available to format specific implementations -template <> std::string VerilatedTrace::timeResStr() const { +template <> std::string VerilatedTrace::timeResStr() const { return doubleToTimescale(m_timeRes); } //========================================================================= // External interface to client code -template <> void VerilatedTrace::set_time_unit(const char* unitp) VL_MT_SAFE { +template <> void VerilatedTrace::set_time_unit(const char* unitp) VL_MT_SAFE { m_timeUnit = timescaleToDouble(unitp); } -template <> void VerilatedTrace::set_time_unit(const std::string& unit) VL_MT_SAFE { +template <> +void VerilatedTrace::set_time_unit(const std::string& unit) VL_MT_SAFE { set_time_unit(unit.c_str()); } -template <> void VerilatedTrace::set_time_resolution(const char* unitp) VL_MT_SAFE { +template <> +void VerilatedTrace::set_time_resolution(const char* unitp) VL_MT_SAFE { m_timeRes = timescaleToDouble(unitp); } template <> -void VerilatedTrace::set_time_resolution(const std::string& unit) VL_MT_SAFE { +void VerilatedTrace::set_time_resolution(const std::string& unit) VL_MT_SAFE { set_time_resolution(unit.c_str()); } template <> -void 
VerilatedTrace::dumpvars(int level, const std::string& hier) VL_MT_SAFE { +void VerilatedTrace::dumpvars(int level, const std::string& hier) VL_MT_SAFE { if (level == 0) { m_dumpvars.clear(); // empty = everything on } else { @@ -435,7 +451,87 @@ void VerilatedTrace::dumpvars(int level, const std::string& hier) } } -template <> void VerilatedTrace::dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex) { +#ifdef VL_TRACE_PARALLEL +template <> // +void VerilatedTrace::parallelWorkerTask(void* datap, bool) { + ParallelWorkerData* const wdp = reinterpret_cast(datap); + // Run the task + wdp->m_cb(wdp->m_userp, wdp->m_bufp); + // Mark buffer as ready + const VerilatedLockGuard lock{wdp->m_mutex}; + wdp->m_ready.store(true); + if (wdp->m_waiting) wdp->m_cv.notify_one(); +} + +template <> VL_ATTR_NOINLINE void VerilatedTrace::ParallelWorkerData::wait() { + // Spin for a while, waiting for the buffer to become ready + for (int i = 0; i < VL_LOCK_SPINS; ++i) { + if (VL_LIKELY(m_ready.load(std::memory_order_relaxed))) return; + VL_CPU_RELAX(); + } + // We have been spinning for a while, so yield the thread + VerilatedLockGuard lock{m_mutex}; + m_waiting = true; + m_cv.wait(lock, [this] { return m_ready.load(std::memory_order_relaxed); }); + m_waiting = false; +} +#endif + +template <> +void VerilatedTrace::runParallelCallbacks(const ParallelCallbackMap& cbMap) { + for (VlThreadPool* threadPoolp : m_threadPoolps) { +#ifdef VL_TRACE_PARALLEL + // If tracing in parallel, dispatch to the thread pool (if exists) + if (threadPoolp && threadPoolp->numThreads()) { + // List of work items for thread (std::list, as ParallelWorkerData is not movable) + std::list workerData; + // We use the whole pool + the main thread + const unsigned threads = threadPoolp->numThreads() + 1; + // Main thread executes all jobs with index % threads == 0 + std::vector mainThreadWorkerData; + // The tracing callbacks to execute on this thread-pool + const auto& cbVec = cbMap.at(threadPoolp); + // Enuque all 
the jobs + for (unsigned i = 0; i < cbVec.size(); ++i) { + const CallbackRecord& cbr = cbVec[i]; + // Always get the trace buffer on the main thread + Buffer* const bufp = getTraceBuffer(); + // Create new work item + workerData.emplace_back(cbr.m_dumpCb, cbr.m_userp, bufp); + // Grab the new work item + ParallelWorkerData* const itemp = &workerData.back(); + // Enqueue task to thread pool, or main thread + if (unsigned rem = i % threads) { + threadPoolp->workerp(rem - 1)->addTask(parallelWorkerTask, itemp, false); + } else { + mainThreadWorkerData.push_back(itemp); + } + } + // Execute main thead jobs + for (ParallelWorkerData* const itemp : mainThreadWorkerData) { + parallelWorkerTask(itemp, false); + } + // Commit all trace buffers in order + for (ParallelWorkerData& item : workerData) { + // Wait until ready + item.wait(); + // Commit the buffer + commitTraceBuffer(item.m_bufp); + } + continue; + } +#endif + // Fall back on sequential execution + for (const CallbackRecord& cbr : cbMap.at(threadPoolp)) { + Buffer* const traceBufferp = getTraceBuffer(); + cbr.m_dumpCb(cbr.m_userp, traceBufferp); + commitTraceBuffer(traceBufferp); + } + } +} + +template <> +void VerilatedTrace::dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex) { // Not really VL_MT_SAFE but more VL_MT_UNSAFE_ONE. 
// This does get the mutex, but if multiple threads are trying to dump // chances are the data being dumped will have other problems @@ -483,20 +579,14 @@ template <> void VerilatedTrace::dump(uint64_t timeui) VL_MT_SAFE_ // Run the callbacks if (VL_UNLIKELY(m_fullDump)) { m_fullDump = false; // No more need for next dump to be full - for (uint32_t i = 0; i < m_fullCbs.size(); ++i) { - const CallbackRecord& cbr = m_fullCbs[i]; - cbr.m_dumpCb(cbr.m_userp, self()); - } + runParallelCallbacks(m_fullCbs); } else { - for (uint32_t i = 0; i < m_chgCbs.size(); ++i) { - const CallbackRecord& cbr = m_chgCbs[i]; - cbr.m_dumpCb(cbr.m_userp, self()); - } + runParallelCallbacks(m_chgCbs); } for (uint32_t i = 0; i < m_cleanupCbs.size(); ++i) { const CallbackRecord& cbr = m_cleanupCbs[i]; - cbr.m_dumpCb(cbr.m_userp, self()); + cbr.m_cleanupCb(cbr.m_userp, self()); } #ifdef VL_TRACE_OFFLOAD @@ -517,8 +607,18 @@ template <> void VerilatedTrace::dump(uint64_t timeui) VL_MT_SAFE_ // Non-hot path internal interface to Verilator generated code template <> -void VerilatedTrace::addCallbackRecord(std::vector& cbVec, - CallbackRecord& cbRec) +void VerilatedTrace::addThreadPool(VlThreadPool* threadPoolp) + VL_MT_SAFE_EXCLUDES(m_mutex) { + const VerilatedLockGuard lock{m_mutex}; + for (VlThreadPool* const poolp : m_threadPoolps) { + if (poolp == threadPoolp) return; + } + m_threadPoolps.push_back(threadPoolp); +} + +template <> +void VerilatedTrace::addCallbackRecord(std::vector& cbVec, + CallbackRecord& cbRec) VL_MT_SAFE_EXCLUDES(m_mutex) { const VerilatedLockGuard lock{m_mutex}; if (VL_UNCOVERABLE(timeLastDump() != 0)) { // LCOV_EXCL_START @@ -529,91 +629,40 @@ void VerilatedTrace::addCallbackRecord(std::vector cbVec.push_back(cbRec); } -template <> void VerilatedTrace::addInitCb(initCb_t cb, void* userp) VL_MT_SAFE { +template <> +void VerilatedTrace::addInitCb(initCb_t cb, void* userp) VL_MT_SAFE { CallbackRecord cbr{cb, userp}; addCallbackRecord(m_initCbs, cbr); } -template <> void 
VerilatedTrace::addFullCb(dumpCb_t cb, void* userp) VL_MT_SAFE { +template <> +void VerilatedTrace::addFullCb(dumpCb_t cb, void* userp, + VlThreadPool* threadPoolp) VL_MT_SAFE { CallbackRecord cbr{cb, userp}; - addCallbackRecord(m_fullCbs, cbr); + addThreadPool(threadPoolp); + addCallbackRecord(m_fullCbs[threadPoolp], cbr); } -template <> void VerilatedTrace::addChgCb(dumpCb_t cb, void* userp) VL_MT_SAFE { +template <> +void VerilatedTrace::addChgCb(dumpCb_t cb, void* userp, + VlThreadPool* threadPoolp) VL_MT_SAFE { CallbackRecord cbr{cb, userp}; - addCallbackRecord(m_chgCbs, cbr); + addThreadPool(threadPoolp); + addCallbackRecord(m_chgCbs[threadPoolp], cbr); } -template <> void VerilatedTrace::addCleanupCb(dumpCb_t cb, void* userp) VL_MT_SAFE { +template <> +void VerilatedTrace::addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE { CallbackRecord cbr{cb, userp}; addCallbackRecord(m_cleanupCbs, cbr); } -template <> void VerilatedTrace::pushNamePrefix(const std::string& prefix) { +template <> void VerilatedTrace::pushNamePrefix(const std::string& prefix) { m_namePrefixStack.push_back(m_namePrefixStack.back() + prefix); } -template <> void VerilatedTrace::popNamePrefix(unsigned count) { +template <> void VerilatedTrace::popNamePrefix(unsigned count) { while (count--) m_namePrefixStack.pop_back(); assert(!m_namePrefixStack.empty()); } -//========================================================================= -// Hot path internal interface to Verilator generated code - -// These functions must write the new value back into the old value store, -// and subsequently call the format specific emit* implementations. Note -// that this file must be included in the format specific implementation, so -// the emit* functions can be inlined for performance. 
- -template <> void VerilatedTrace::fullBit(uint32_t* oldp, CData newval) { - const uint32_t code = oldp - m_sigs_oldvalp; - *oldp = newval; // Still copy even if not tracing so chg doesn't call full - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - self()->emitBit(code, newval); -} - -template <> void VerilatedTrace::fullCData(uint32_t* oldp, CData newval, int bits) { - const uint32_t code = oldp - m_sigs_oldvalp; - *oldp = newval; // Still copy even if not tracing so chg doesn't call full - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - self()->emitCData(code, newval, bits); -} - -template <> void VerilatedTrace::fullSData(uint32_t* oldp, SData newval, int bits) { - const uint32_t code = oldp - m_sigs_oldvalp; - *oldp = newval; // Still copy even if not tracing so chg doesn't call full - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - self()->emitSData(code, newval, bits); -} - -template <> void VerilatedTrace::fullIData(uint32_t* oldp, IData newval, int bits) { - const uint32_t code = oldp - m_sigs_oldvalp; - *oldp = newval; // Still copy even if not tracing so chg doesn't call full - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - self()->emitIData(code, newval, bits); -} - -template <> void VerilatedTrace::fullQData(uint32_t* oldp, QData newval, int bits) { - const uint32_t code = oldp - m_sigs_oldvalp; - *reinterpret_cast(oldp) = newval; - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - self()->emitQData(code, newval, bits); -} - -template <> -void VerilatedTrace::fullWData(uint32_t* oldp, const WData* newvalp, int bits) { - const uint32_t code = oldp - m_sigs_oldvalp; - for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i]; - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - self()->emitWData(code, newvalp, bits); -} - 
-template <> void VerilatedTrace::fullDouble(uint32_t* oldp, double newval) { - const uint32_t code = oldp - m_sigs_oldvalp; - *reinterpret_cast(oldp) = newval; - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - // cppcheck-suppress invalidPointerCast - self()->emitDouble(code, newval); -} - //========================================================================= // Primitives converting binary values to strings... @@ -704,4 +753,86 @@ static inline void cvtQDataToStr(char* dstp, QData value) { #define cvtEDataToStr cvtIDataToStr +//========================================================================= +// VerilatedTraceBuffer + +template <> // +VerilatedTraceBuffer::VerilatedTraceBuffer(VL_SUB_T& owner) + : m_owner{owner} { +#ifdef VL_TRACE_OFFLOAD + if (m_offloadBufferWritep) { + using This = VerilatedTraceBuffer*; + // Tack on the buffer address + static_assert(2 * sizeof(uint32_t) >= sizeof(This), + "This should be enough on all plafrorms"); + *m_offloadBufferWritep++ = VerilatedTraceOffloadCommand::TRACE_BUFFER; + *reinterpret_cast(m_offloadBufferWritep) = this; + m_offloadBufferWritep += 2; + } +#endif +} + +// These functions must write the new value back into the old value store, +// and subsequently call the format specific emit* implementations. Note +// that this file must be included in the format specific implementation, so +// the emit* functions can be inlined for performance. 
+ +template <> // +void VerilatedTraceBuffer::fullBit(uint32_t* oldp, CData newval) { + const uint32_t code = oldp - m_sigs_oldvalp; + *oldp = newval; // Still copy even if not tracing so chg doesn't call full + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + self()->emitBit(code, newval); +} + +template <> +void VerilatedTraceBuffer::fullCData(uint32_t* oldp, CData newval, int bits) { + const uint32_t code = oldp - m_sigs_oldvalp; + *oldp = newval; // Still copy even if not tracing so chg doesn't call full + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + self()->emitCData(code, newval, bits); +} + +template <> +void VerilatedTraceBuffer::fullSData(uint32_t* oldp, SData newval, int bits) { + const uint32_t code = oldp - m_sigs_oldvalp; + *oldp = newval; // Still copy even if not tracing so chg doesn't call full + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + self()->emitSData(code, newval, bits); +} + +template <> +void VerilatedTraceBuffer::fullIData(uint32_t* oldp, IData newval, int bits) { + const uint32_t code = oldp - m_sigs_oldvalp; + *oldp = newval; // Still copy even if not tracing so chg doesn't call full + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + self()->emitIData(code, newval, bits); +} + +template <> +void VerilatedTraceBuffer::fullQData(uint32_t* oldp, QData newval, int bits) { + const uint32_t code = oldp - m_sigs_oldvalp; + *reinterpret_cast(oldp) = newval; + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + self()->emitQData(code, newval, bits); +} + +template <> +void VerilatedTraceBuffer::fullWData(uint32_t* oldp, const WData* newvalp, + int bits) { + const uint32_t code = oldp - m_sigs_oldvalp; + for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i]; + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + 
self()->emitWData(code, newvalp, bits); +} + +template <> +void VerilatedTraceBuffer::fullDouble(uint32_t* oldp, double newval) { + const uint32_t code = oldp - m_sigs_oldvalp; + *reinterpret_cast(oldp) = newval; + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + // cppcheck-suppress invalidPointerCast + self()->emitDouble(code, newval); +} + #endif // VL_CPPCHECK diff --git a/include/verilated_vcd_c.cpp b/include/verilated_vcd_c.cpp index 8e0008e3f..9db71aabc 100644 --- a/include/verilated_vcd_c.cpp +++ b/include/verilated_vcd_c.cpp @@ -62,12 +62,23 @@ constexpr unsigned VL_TRACE_MAX_VCD_CODE_SIZE = 5; // Maximum length of a VCD s // cache-lines. constexpr unsigned VL_TRACE_SUFFIX_ENTRY_SIZE = 8; // Size of a suffix entry +//============================================================================= +// Utility functions: TODO: put these in a common place and share them. + +template static size_t roundUpToMultipleOf(size_t value) { + static_assert((N & (N - 1)) == 0, "'N' must be a power of 2"); + size_t mask = N - 1; + return (value + mask) & ~mask; +} + //============================================================================= // Specialization of the generics for this trace format -#define VL_DERIVED_T VerilatedVcd +#define VL_SUB_T VerilatedVcd +#define VL_BUF_T VerilatedVcdBuffer #include "verilated_trace_imp.h" -#undef VL_DERIVED_T +#undef VL_SUB_T +#undef VL_BUF_T //============================================================================= //============================================================================= @@ -183,7 +194,7 @@ void VerilatedVcd::makeNameMap() { deleteNameMap(); m_namemapp = new NameMap; - VerilatedTrace::traceInit(); + Super::traceInit(); // Though not speced, it's illegal to generate a vcd with signals // not under any module - it crashes at least two viewers. 
@@ -218,13 +229,17 @@ VerilatedVcd::~VerilatedVcd() { if (m_wrBufp) VL_DO_CLEAR(delete[] m_wrBufp, m_wrBufp = nullptr); deleteNameMap(); if (m_filep && m_fileNewed) VL_DO_CLEAR(delete m_filep, m_filep = nullptr); +#ifdef VL_TRACE_PARALLEL + assert(m_numBuffers == m_freeBuffers.size()); + for (auto& pair : m_freeBuffers) VL_DO_CLEAR(delete[] pair.first, pair.first = nullptr); +#endif } void VerilatedVcd::closePrev() { // This function is on the flush() call path if (!isOpen()) return; - VerilatedTrace::flushBase(); + Super::flushBase(); bufferFlush(); m_isOpen = false; m_filep->close(); @@ -251,14 +266,14 @@ void VerilatedVcd::close() VL_MT_SAFE_EXCLUDES(m_mutex) { printStr(" $end\n"); } closePrev(); - // closePrev() called VerilatedTrace::flush(), so we just + // closePrev() called Super::flush(), so we just // need to shut down the tracing thread here. - VerilatedTrace::closeBase(); + Super::closeBase(); } void VerilatedVcd::flush() VL_MT_SAFE_EXCLUDES(m_mutex) { const VerilatedLockGuard lock{m_mutex}; - VerilatedTrace::flushBase(); + Super::flushBase(); bufferFlush(); } @@ -277,12 +292,12 @@ void VerilatedVcd::printQuad(uint64_t n) { printStr(buf); } -void VerilatedVcd::bufferResize(uint64_t minsize) { +void VerilatedVcd::bufferResize(size_t minsize) { // minsize is size of largest write. We buffer at least 8 times as much data, // writing when we are 3/4 full (with thus 2*minsize remaining free) if (VL_UNLIKELY(minsize > m_wrChunkSize)) { const char* oldbufp = m_wrBufp; - m_wrChunkSize = minsize * 2; + m_wrChunkSize = roundUpToMultipleOf<1024>(minsize * 2); m_wrBufp = new char[m_wrChunkSize * 8]; std::memcpy(m_wrBufp, oldbufp, m_writep - oldbufp); m_writep = m_wrBufp + (m_writep - oldbufp); @@ -463,14 +478,16 @@ void VerilatedVcd::declare(uint32_t code, const char* name, const char* wirep, b int arraynum, bool tri, bool bussed, int msb, int lsb) { const int bits = ((msb > lsb) ? 
(msb - lsb) : (lsb - msb)) + 1; - const bool enabled = VerilatedTrace::declCode(code, name, bits, tri); + const bool enabled = Super::declCode(code, name, bits, tri); if (m_suffixes.size() <= nextCode() * VL_TRACE_SUFFIX_ENTRY_SIZE) { m_suffixes.resize(nextCode() * VL_TRACE_SUFFIX_ENTRY_SIZE * 2, 0); } - // Make sure write buffer is large enough (one character per bit), plus header - bufferResize(bits + 1024); + // Keep upper bound on bytes a single signal cna emit into the buffer + m_maxSignalBytes = std::max(m_maxSignalBytes, bits + 32); + // Make sure write buffer is large enough, plus header + bufferResize(m_maxSignalBytes + 1024); if (!enabled) return; @@ -564,7 +581,71 @@ void VerilatedVcd::declDouble(uint32_t code, const char* name, bool array, int a } //============================================================================= -// Trace rendering prinitives +// Get/commit trace buffer + +VerilatedVcdBuffer* VerilatedVcd::getTraceBuffer() { +#ifdef VL_TRACE_PARALLEL + // Note: This is called from VeriltedVcd::dump, which already holds the lock + // If no buffer available, allocate a new one + if (m_freeBuffers.empty()) { + constexpr size_t pageSize = 4096; + // 4 * m_maxSignalBytes, so we can reserve 2 * m_maxSignalBytes at the end for safety + size_t startingSize = roundUpToMultipleOf(4 * m_maxSignalBytes); + m_freeBuffers.emplace_back(new char[startingSize], startingSize); + ++m_numBuffers; + } + // Grab a buffer + const auto pair = m_freeBuffers.back(); + m_freeBuffers.pop_back(); + // Return the buffer + return new VerilatedVcdBuffer{*this, pair.first, pair.second}; +#else + return new VerilatedVcdBuffer{*this}; +#endif +} + +void VerilatedVcd::commitTraceBuffer(VerilatedVcdBuffer* bufp) { +#ifdef VL_TRACE_PARALLEL + // Note: This is called from VeriltedVcd::dump, which already holds the lock + // Resize output buffer. Note, we use the full size of the trace buffer, as + // this is a lot more stable than the actual occupancy of the trace buffer. 
+ // This helps us to avoid re-allocations due to small size changes. + bufferResize(bufp->m_size); + // Compute occupancy of buffer + const size_t usedSize = bufp->m_writep - bufp->m_bufp; + // Copy to output buffer + std::memcpy(m_writep, bufp->m_bufp, usedSize); + // Adjust write pointer + m_writep += usedSize; + // Flush if necessary + bufferCheck(); + // Put buffer back on free list + m_freeBuffers.emplace_back(bufp->m_bufp, bufp->m_size); +#else + // Needs adjusting for emitTimeChange + m_writep = bufp->m_writep; +#endif + delete bufp; +} + +//============================================================================= +// VerilatedVcdBuffer implementation + +#ifdef VL_TRACE_PARALLEL +VerilatedVcdBuffer::VerilatedVcdBuffer(VerilatedVcd& owner, char* bufp, size_t size) + : VerilatedTraceBuffer{owner} + , m_writep{bufp} + , m_bufp{bufp} + , m_size{size} { + adjustGrowp(); +} +#else +VerilatedVcdBuffer::VerilatedVcdBuffer(VerilatedVcd& owner) + : VerilatedTraceBuffer{owner} {} +#endif + +//============================================================================= +// Trace rendering primitives static inline void VerilatedVcdCCopyAndAppendNewLine(char* writep, const char* suffixp) VL_ATTR_NO_SANITIZE_ALIGN; @@ -589,15 +670,44 @@ static inline void VerilatedVcdCCopyAndAppendNewLine(char* writep, const char* s #endif } -void VerilatedVcd::finishLine(uint32_t code, char* writep) { - const char* const suffixp = m_suffixes.data() + code * VL_TRACE_SUFFIX_ENTRY_SIZE; +void VerilatedVcdBuffer::finishLine(uint32_t code, char* writep) { + const char* const suffixp = m_suffixes + code * VL_TRACE_SUFFIX_ENTRY_SIZE; VL_DEBUG_IFDEF(assert(suffixp[0]);); VerilatedVcdCCopyAndAppendNewLine(writep, suffixp); // Now write back the write pointer incremented by the actual size of the // suffix, which was stored in the last byte of the suffix buffer entry. 
m_writep = writep + suffixp[VL_TRACE_SUFFIX_ENTRY_SIZE - 1]; - bufferCheck(); + +#ifdef VL_TRACE_PARALLEL + // Double the size of the buffer if necessary + if (VL_UNLIKELY(m_writep >= m_growp)) { + // Compute occupied size of current buffer + const size_t usedSize = m_writep - m_bufp; + // We are always doubling the size + m_size *= 2; + // Allocate the new buffer + char* const newBufp = new char[m_size]; + // Copy from current buffer to new buffer + std::memcpy(newBufp, m_bufp, usedSize); + // Delete current buffer + delete[] m_bufp; + // Make new buffer the current buffer + m_bufp = newBufp; + // Adjust write pointer + m_writep = m_bufp + usedSize; + // Adjust resize limit + adjustGrowp(); + } +#else + // Flush the write buffer if there's not enough space left for new information + // We only call this once per vector, so we need enough slop for a very wide "b###" line + if (VL_UNLIKELY(m_writep > m_wrFlushp)) { + m_owner.m_writep = m_writep; + m_owner.bufferFlush(); + m_writep = m_owner.m_writep; + } +#endif } //============================================================================= @@ -608,7 +718,7 @@ void VerilatedVcd::finishLine(uint32_t code, char* writep) { // so always inline them. 
VL_ATTR_ALWINLINE -void VerilatedVcd::emitBit(uint32_t code, CData newval) { +void VerilatedVcdBuffer::emitBit(uint32_t code, CData newval) { // Don't prefetch suffix as it's a bit too late; char* wp = m_writep; *wp++ = '0' | static_cast(newval); @@ -616,7 +726,7 @@ void VerilatedVcd::emitBit(uint32_t code, CData newval) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitCData(uint32_t code, CData newval, int bits) { +void VerilatedVcdBuffer::emitCData(uint32_t code, CData newval, int bits) { char* wp = m_writep; *wp++ = 'b'; cvtCDataToStr(wp, newval << (VL_BYTESIZE - bits)); @@ -624,7 +734,7 @@ void VerilatedVcd::emitCData(uint32_t code, CData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitSData(uint32_t code, SData newval, int bits) { +void VerilatedVcdBuffer::emitSData(uint32_t code, SData newval, int bits) { char* wp = m_writep; *wp++ = 'b'; cvtSDataToStr(wp, newval << (VL_SHORTSIZE - bits)); @@ -632,7 +742,7 @@ void VerilatedVcd::emitSData(uint32_t code, SData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitIData(uint32_t code, IData newval, int bits) { +void VerilatedVcdBuffer::emitIData(uint32_t code, IData newval, int bits) { char* wp = m_writep; *wp++ = 'b'; cvtIDataToStr(wp, newval << (VL_IDATASIZE - bits)); @@ -640,7 +750,7 @@ void VerilatedVcd::emitIData(uint32_t code, IData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitQData(uint32_t code, QData newval, int bits) { +void VerilatedVcdBuffer::emitQData(uint32_t code, QData newval, int bits) { char* wp = m_writep; *wp++ = 'b'; cvtQDataToStr(wp, newval << (VL_QUADSIZE - bits)); @@ -648,7 +758,7 @@ void VerilatedVcd::emitQData(uint32_t code, QData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitWData(uint32_t code, const WData* newvalp, int bits) { +void VerilatedVcdBuffer::emitWData(uint32_t code, const WData* newvalp, int bits) { int words = VL_WORDS_I(bits); char* wp = m_writep; *wp++ = 'b'; @@ -665,10 +775,10 @@ void 
VerilatedVcd::emitWData(uint32_t code, const WData* newvalp, int bits) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitDouble(uint32_t code, double newval) { +void VerilatedVcdBuffer::emitDouble(uint32_t code, double newval) { char* wp = m_writep; // Buffer can't overflow before VL_SNPRINTF; we sized during declaration - VL_SNPRINTF(wp, m_wrChunkSize, "r%.16g", newval); + VL_SNPRINTF(wp, m_maxSignalBytes, "r%.16g", newval); wp += std::strlen(wp); finishLine(code, wp); } diff --git a/include/verilated_vcd_c.h b/include/verilated_vcd_c.h index b1485e13b..0d83eb25d 100644 --- a/include/verilated_vcd_c.h +++ b/include/verilated_vcd_c.h @@ -28,39 +28,20 @@ #include #include -class VerilatedVcd; - -//============================================================================= -// VerilatedFile -/// Class representing a file to write to. These virtual methods can be -/// overrode for e.g. socket I/O. - -class VerilatedVcdFile VL_NOT_FINAL { -private: - int m_fd = 0; // File descriptor we're writing to -public: - // METHODS - /// Construct a (as yet) closed file - VerilatedVcdFile() = default; - /// Close and destruct - virtual ~VerilatedVcdFile() = default; - /// Open a file with given filename - virtual bool open(const std::string& name) VL_MT_UNSAFE; - /// Close object's file - virtual void close() VL_MT_UNSAFE; - /// Write data to file (if it is open) - virtual ssize_t write(const char* bufp, ssize_t len) VL_MT_UNSAFE; -}; +class VerilatedVcdBuffer; +class VerilatedVcdFile; //============================================================================= // VerilatedVcd // Base class to create a Verilator VCD dump // This is an internally used class - see VerilatedVcdC for what to call from applications -class VerilatedVcd VL_NOT_FINAL : public VerilatedTrace { +class VerilatedVcd VL_NOT_FINAL : public VerilatedTrace { +public: + using Super = VerilatedTrace; + private: - // Give the superclass access to private bits (to avoid virtual functions) - friend class 
VerilatedTrace; + friend Buffer; // Give the buffer access to the private bits //========================================================================= // VCD specific internals @@ -74,9 +55,10 @@ private: int m_modDepth = 0; // Depth of module hierarchy char* m_wrBufp; // Output buffer - const char* m_wrFlushp; // Output buffer flush trigger location + char* m_wrFlushp; // Output buffer flush trigger location char* m_writep; // Write pointer into output buffer - uint64_t m_wrChunkSize; // Output buffer size + size_t m_wrChunkSize; // Output buffer size + size_t m_maxSignalBytes = 0; // Upper bound on number of bytes a single signal can generate uint64_t m_wroteBytes = 0; // Number of bytes written to this file std::vector m_suffixes; // VCD line end string codes + metadata @@ -84,7 +66,13 @@ private: using NameMap = std::map; NameMap* m_namemapp = nullptr; // List of names for the header - void bufferResize(uint64_t minsize); +#ifdef VL_TRACE_PARALLEL + // Vector of free trace buffers as (pointer, size) pairs. 
+ std::vector> m_freeBuffers; + size_t m_numBuffers = 0; // Number of trace buffers allocated +#endif + + void bufferResize(size_t minsize); void bufferFlush() VL_MT_UNSAFE_ONE; inline void bufferCheck() { // Flush the write buffer if there's not enough space left for new information @@ -107,8 +95,6 @@ private: static char* writeCode(char* writep, uint32_t code); - void finishLine(uint32_t code, char* writep); - // CONSTRUCTORS VL_UNCOPYABLE(VerilatedVcd); @@ -116,27 +102,22 @@ protected: //========================================================================= // Implementation of VerilatedTrace interface - // Implementations of protected virtual methods for VerilatedTrace + // Called when the trace moves forward to a new time point virtual void emitTimeChange(uint64_t timeui) override; // Hooks called from VerilatedTrace virtual bool preFullDump() override { return isOpen(); } virtual bool preChangeDump() override; - // Implementations of duck-typed methods for VerilatedTrace. These are - // called from only one place (namely full*) so always inline them. 
- inline void emitBit(uint32_t code, CData newval); - inline void emitCData(uint32_t code, CData newval, int bits); - inline void emitSData(uint32_t code, SData newval, int bits); - inline void emitIData(uint32_t code, IData newval, int bits); - inline void emitQData(uint32_t code, QData newval, int bits); - inline void emitWData(uint32_t code, const WData* newvalp, int bits); - inline void emitDouble(uint32_t code, double newval); + // Trace buffer management + virtual VerilatedVcdBuffer* getTraceBuffer() override; + virtual void commitTraceBuffer(VerilatedVcdBuffer*) override; public: //========================================================================= // External interface to client code + // CONSTRUCTOR explicit VerilatedVcd(VerilatedVcdFile* filep = nullptr); ~VerilatedVcd(); @@ -144,7 +125,7 @@ public: // Set size in megabytes after which new file should be created void rolloverMB(uint64_t rolloverMB) { m_rolloverMB = rolloverMB; } - // METHODS + // METHODS - All must be thread safe // Open the file; call isOpen() to see if errors void open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex); // Open next data-only file @@ -167,15 +148,92 @@ public: }; #ifndef DOXYGEN -// Declare specializations here they are used in VerilatedVcdC just below -template <> void VerilatedTrace::dump(uint64_t timeui); -template <> void VerilatedTrace::set_time_unit(const char* unitp); -template <> void VerilatedTrace::set_time_unit(const std::string& unit); -template <> void VerilatedTrace::set_time_resolution(const char* unitp); -template <> void VerilatedTrace::set_time_resolution(const std::string& unit); -template <> void VerilatedTrace::dumpvars(int level, const std::string& hier); +// Declare specializations here as they are used in VerilatedVcdC just below +template <> void VerilatedVcd::Super::dump(uint64_t time); +template <> void VerilatedVcd::Super::set_time_unit(const char* unitp); +template <> void VerilatedVcd::Super::set_time_unit(const std::string& unit);
+template <> void VerilatedVcd::Super::set_time_resolution(const char* unitp); +template <> void VerilatedVcd::Super::set_time_resolution(const std::string& unit); +template <> void VerilatedVcd::Super::dumpvars(int level, const std::string& hier); #endif // DOXYGEN +//============================================================================= +// VerilatedVcdBuffer + +class VerilatedVcdBuffer final : public VerilatedTraceBuffer { + // Give the trace file access to the private bits + friend VerilatedVcd; + friend VerilatedVcd::Super; + +#ifdef VL_TRACE_PARALLEL + char* m_writep; // Write pointer into m_bufp + char* m_bufp; // The beginning of the trace buffer + size_t m_size; // The size of the buffer at m_bufp + char* m_growp; // Resize limit pointer +#else + char* m_writep = m_owner.m_writep; // Write pointer into output buffer + char* const m_wrFlushp = m_owner.m_wrFlushp; // Output buffer flush trigger location +#endif + + // VCD line end string codes + metadata + const char* const m_suffixes = m_owner.m_suffixes.data(); + // The maximum number of bytes a single signal can emit + const size_t m_maxSignalBytes = m_owner.m_maxSignalBytes; + + void finishLine(uint32_t code, char* writep); + +#ifdef VL_TRACE_PARALLEL + void adjustGrowp() { + m_growp = (m_bufp + m_size) - (2 * m_maxSignalBytes); + assert(m_growp >= m_bufp + m_maxSignalBytes); + } +#endif + +public: + // CONSTRUCTOR +#ifdef VL_TRACE_PARALLEL + explicit VerilatedVcdBuffer(VerilatedVcd& owner, char* bufp, size_t size); +#else + explicit VerilatedVcdBuffer(VerilatedVcd& owner); +#endif + ~VerilatedVcdBuffer() = default; + + //========================================================================= + // Implementation of VerilatedTraceBuffer interface + + // Implementations of duck-typed methods for VerilatedTraceBuffer. These are + // called from only one place (the full* methods), so always inline them. 
+ VL_ATTR_ALWINLINE inline void emitBit(uint32_t code, CData newval); + VL_ATTR_ALWINLINE inline void emitCData(uint32_t code, CData newval, int bits); + VL_ATTR_ALWINLINE inline void emitSData(uint32_t code, SData newval, int bits); + VL_ATTR_ALWINLINE inline void emitIData(uint32_t code, IData newval, int bits); + VL_ATTR_ALWINLINE inline void emitQData(uint32_t code, QData newval, int bits); + VL_ATTR_ALWINLINE inline void emitWData(uint32_t code, const WData* newvalp, int bits); + VL_ATTR_ALWINLINE inline void emitDouble(uint32_t code, double newval); +}; + +//============================================================================= +// VerilatedVcdFile +/// Class representing a file to write to. These virtual methods can be +/// overridden for e.g. socket I/O. + +class VerilatedVcdFile VL_NOT_FINAL { +private: + int m_fd = 0; // File descriptor we're writing to +public: + // METHODS + /// Construct a (as yet) closed file + VerilatedVcdFile() = default; + /// Close and destruct + virtual ~VerilatedVcdFile() = default; + /// Open a file with given filename + virtual bool open(const std::string& name) VL_MT_UNSAFE; + /// Close object's file + virtual void close() VL_MT_UNSAFE; + /// Write data to file (if it is open) + virtual ssize_t write(const char* bufp, ssize_t len) VL_MT_UNSAFE; +}; + //============================================================================= // VerilatedVcdC /// Class representing a VCD dump file in C standalone (no SystemC) diff --git a/include/verilatedos.h b/include/verilatedos.h index 28412cac4..6bacfe27b 100644 --- a/include/verilatedos.h +++ b/include/verilatedos.h @@ -40,6 +40,7 @@ #ifdef __GNUC__ # define VL_ATTR_ALIGNED(alignment) __attribute__((aligned(alignment))) # define VL_ATTR_ALWINLINE __attribute__((always_inline)) +# define VL_ATTR_NOINLINE __attribute__((noinline)) # define VL_ATTR_COLD __attribute__((cold)) # define VL_ATTR_HOT __attribute__((hot)) # define VL_ATTR_NORETURN __attribute__((noreturn)) @@ -82,6 +83,9 @@
#ifndef VL_ATTR_ALWINLINE # define VL_ATTR_ALWINLINE ///< Attribute to inline, even when not optimizing #endif +#ifndef VL_ATTR_NOINLINE +# define VL_ATTR_NOINLINE ///< Attribute to never inline, even when optimizing +#endif #ifndef VL_ATTR_COLD # define VL_ATTR_COLD ///< Attribute that function is rarely executed #endif diff --git a/src/V3EmitCImp.cpp b/src/V3EmitCImp.cpp index 0d979b143..c88648d3f 100644 --- a/src/V3EmitCImp.cpp +++ b/src/V3EmitCImp.cpp @@ -751,26 +751,26 @@ class EmitCTrace final : EmitCFunc { const string func = nodep->full() ? "full" : "chg"; bool emitWidth = true; if (nodep->dtypep()->basicp()->isDouble()) { - puts("tracep->" + func + "Double"); + puts("bufp->" + func + "Double"); emitWidth = false; } else if (nodep->isWide() || emitTraceIsScBv(nodep) || emitTraceIsScBigUint(nodep)) { - puts("tracep->" + func + "WData"); + puts("bufp->" + func + "WData"); } else if (nodep->isQuad()) { - puts("tracep->" + func + "QData"); + puts("bufp->" + func + "QData"); } else if (nodep->declp()->widthMin() > 16) { - puts("tracep->" + func + "IData"); + puts("bufp->" + func + "IData"); } else if (nodep->declp()->widthMin() > 8) { - puts("tracep->" + func + "SData"); + puts("bufp->" + func + "SData"); } else if (nodep->declp()->widthMin() > 1) { - puts("tracep->" + func + "CData"); + puts("bufp->" + func + "CData"); } else { - puts("tracep->" + func + "Bit"); + puts("bufp->" + func + "Bit"); emitWidth = false; } const uint32_t offset = (arrayindex < 0) ? 0 : (arrayindex * nodep->declp()->widthWords()); const uint32_t code = nodep->declp()->code() + offset; - puts(v3Global.opt.useTraceOffloadThread() && !nodep->full() ? "(base+" : "(oldp+"); + puts(v3Global.opt.useTraceOffload() && !nodep->full() ? 
"(base+" : "(oldp+"); puts(cvtToStr(code - nodep->baseCode())); puts(","); emitTraceValue(nodep, arrayindex); diff --git a/src/V3EmitCMake.cpp b/src/V3EmitCMake.cpp index 67e8a741c..7df71dfeb 100644 --- a/src/V3EmitCMake.cpp +++ b/src/V3EmitCMake.cpp @@ -113,9 +113,8 @@ class CMakeEmitter final { cmake_set_raw(*of, name + "_COVERAGE", v3Global.opt.coverage() ? "1" : "0"); *of << "# Threaded output mode? 0/1/N threads (from --threads)\n"; cmake_set_raw(*of, name + "_THREADS", cvtToStr(v3Global.opt.threads())); - *of << "# Threaded tracing output mode? 0/1/N threads (from --trace-threads)\n"; - cmake_set_raw(*of, name + "_TRACE_THREADS", - cvtToStr(v3Global.opt.useTraceOffloadThread())); + *of << "# Threaded tracing output mode? 0/1/N threads (from --threads/--trace-threads)\n"; + cmake_set_raw(*of, name + "_TRACE_THREADS", cvtToStr(v3Global.opt.vmTraceThreads())); cmake_set_raw(*of, name + "_TRACE_FST_WRITER_THREAD", v3Global.opt.traceThreads() && v3Global.opt.traceFormat().fst() ? "1" : "0"); *of << "# Struct output mode? 0/1 (from --trace-structs)\n"; diff --git a/src/V3EmitMk.cpp b/src/V3EmitMk.cpp index 429b78d33..b748d9553 100644 --- a/src/V3EmitMk.cpp +++ b/src/V3EmitMk.cpp @@ -73,9 +73,10 @@ public: of.puts("VM_TRACE_FST = "); of.puts(v3Global.opt.trace() && v3Global.opt.traceFormat().fst() ? "1" : "0"); of.puts("\n"); - of.puts("# Tracing threaded output mode? 0/1/N threads (from --trace-thread)\n"); + of.puts( + "# Tracing threaded output mode? 0/1/N threads (from --threads/--trace-thread)\n"); of.puts("VM_TRACE_THREADS = "); - of.puts(cvtToStr(v3Global.opt.useTraceOffloadThread())); + of.puts(cvtToStr(v3Global.opt.vmTraceThreads())); of.puts("\n"); of.puts("# Separate FST writer thread? 
0/1 (from --trace-fst with --trace-thread > 0)\n"); of.puts("VM_TRACE_FST_WRITER_THREAD = "); diff --git a/src/V3Options.cpp b/src/V3Options.cpp index 1b74f1062..93d23eb5e 100644 --- a/src/V3Options.cpp +++ b/src/V3Options.cpp @@ -775,8 +775,16 @@ void V3Options::notify() { && !v3Global.opt.xmlOnly()); } - // --trace-threads implies --threads 1 unless explicitly specified - if (traceThreads() && !threads()) m_threads = 1; + if (trace()) { + // With --trace-fst, --trace-threads implies --threads 1 unless explicitly specified + if (traceFormat().fst() && traceThreads() && !threads()) m_threads = 1; + + // With --trace, --trace-threads is ignored + if (traceFormat().vcd()) m_traceThreads = threads() ? 1 : 0; + } + + UASSERT(!(useTraceParallel() && useTraceOffload()), + "Cannot use both parallel and offloaded tracing"); // Default split limits if not specified if (m_outputSplitCFuncs < 0) m_outputSplitCFuncs = m_outputSplit; @@ -1350,7 +1358,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char DECL_OPTION("-trace-threads", CbVal, [this, fl](const char* valp) { m_trace = true; m_traceThreads = std::atoi(valp); - if (m_traceThreads < 0) fl->v3fatal("--trace-threads must be >= 0: " << valp); + if (m_traceThreads < 1) fl->v3fatal("--trace-threads must be >= 1: " << valp); }); DECL_OPTION("-trace-underscore", OnOff, &m_traceUnderscore); diff --git a/src/V3Options.h b/src/V3Options.h index 35a71ed31..b9b5ef8ff 100644 --- a/src/V3Options.h +++ b/src/V3Options.h @@ -518,8 +518,10 @@ public: int traceMaxArray() const { return m_traceMaxArray; } int traceMaxWidth() const { return m_traceMaxWidth; } int traceThreads() const { return m_traceThreads; } - bool useTraceOffloadThread() const { - return traceThreads() == 0 ? 
0 : traceThreads() - traceFormat().fst(); + bool useTraceOffload() const { return trace() && traceFormat().fst() && traceThreads() > 1; } + bool useTraceParallel() const { return trace() && traceFormat().vcd() && threads() > 1; } + unsigned vmTraceThreads() const { + return useTraceParallel() ? threads() : useTraceOffload() ? 1 : 0; } int unrollCount() const { return m_unrollCount; } int unrollStmts() const { return m_unrollStmts; } diff --git a/src/V3Trace.cpp b/src/V3Trace.cpp index 61d009b6f..9fa1b099a 100644 --- a/src/V3Trace.cpp +++ b/src/V3Trace.cpp @@ -180,6 +180,10 @@ private: TraceActivityVertex* const m_alwaysVtxp; // "Always trace" vertex bool m_finding = false; // Pass one of algorithm? + // Trace parallelism. Only VCD tracing can be parallelized at this time. + const uint32_t m_parallelism + = v3Global.opt.useTraceParallel() ? static_cast(v3Global.opt.threads()) : 1; + VDouble0 m_statUniqSigs; // Statistic tracking VDouble0 m_statUniqCodes; // Statistic tracking @@ -388,7 +392,7 @@ private: if (!it->second->duplicatep()) { uint32_t cost = 0; const AstTraceDecl* const declp = it->second->nodep(); - // The number of comparisons required by tracep->chg* + // The number of comparisons required by bufp->chg* cost += declp->isWide() ? declp->codeInc() : 1; // Arrays are traced by element cost *= declp->arrayRange().ranged() ? 
declp->arrayRange().elements() : 1; @@ -494,7 +498,7 @@ private: }; if (isTopFunc) { // Top functions - funcp->argTypes("void* voidSelf, " + v3Global.opt.traceClassBase() + "* tracep"); + funcp->argTypes("void* voidSelf, " + v3Global.opt.traceClassBase() + "::Buffer* bufp"); addInitStr(voidSelfAssign(m_topModp)); addInitStr(symClassAssign()); // Add global activity check to change dump functions @@ -508,32 +512,33 @@ private: m_regFuncp->addStmtsp(new AstText(flp, "tracep->addChgCb(", true)); } m_regFuncp->addStmtsp(new AstAddrOfCFunc(flp, funcp)); - m_regFuncp->addStmtsp(new AstText(flp, ", vlSelf);\n", true)); + const string threadPool{m_parallelism > 1 ? "vlSymsp->__Vm_threadPoolp" : "nullptr"}; + m_regFuncp->addStmtsp(new AstText(flp, ", vlSelf, " + threadPool + ");\n", true)); } else { // Sub functions - funcp->argTypes(v3Global.opt.traceClassBase() + "* tracep"); + funcp->argTypes(v3Global.opt.traceClassBase() + "::Buffer* bufp"); // Setup base references. Note in rare occasions we can end up with an empty trace // sub function, hence the VL_ATTR_UNUSED attributes. 
if (full) { // Full dump sub function addInitStr("uint32_t* const oldp VL_ATTR_UNUSED = " - "tracep->oldp(vlSymsp->__Vm_baseCode);\n"); + "bufp->oldp(vlSymsp->__Vm_baseCode);\n"); } else { // Change dump sub function - if (v3Global.opt.useTraceOffloadThread()) { + if (v3Global.opt.useTraceOffload()) { addInitStr("const uint32_t base VL_ATTR_UNUSED = " "vlSymsp->__Vm_baseCode + " + cvtToStr(baseCode) + ";\n"); - addInitStr("if (false && tracep) {} // Prevent unused\n"); + addInitStr("if (false && bufp) {} // Prevent unused\n"); } else { addInitStr("uint32_t* const oldp VL_ATTR_UNUSED = " - "tracep->oldp(vlSymsp->__Vm_baseCode + " + "bufp->oldp(vlSymsp->__Vm_baseCode + " + cvtToStr(baseCode) + ");\n"); } } // Add call to top function AstCCall* const callp = new AstCCall(funcp->fileline(), funcp); - callp->argTypes("tracep"); + callp->argTypes("bufp"); topFuncp->addStmtsp(callp); } // Done @@ -728,7 +733,7 @@ private: // We will split functions such that each have to dump roughly the same amount of data // for this we need to keep tack of the number of codes used by the trace functions. 
uint32_t nFullCodes = 0; // Number of non-duplicate codes (need to go into full* dump) - uint32_t nChgCodes = 0; // Number of non-consant codes (need to go in to chg* dump) + uint32_t nChgCodes = 0; // Number of non-constant codes (need to go in to chg* dump) sortTraces(traces, nFullCodes, nChgCodes); UINFO(5, "nFullCodes: " << nFullCodes << " nChgCodes: " << nChgCodes << endl); @@ -747,13 +752,11 @@ private: m_regFuncp->isLoose(true); m_topScopep->addActivep(m_regFuncp); - const int parallelism = 1; // Note: will bump this later, code below works for any value - // Create the full dump functions, also allocates signal numbers - createFullTraceFunction(traces, nFullCodes, parallelism); + createFullTraceFunction(traces, nFullCodes, m_parallelism); // Create the incremental dump functions - createChgTraceFunctions(traces, nChgCodes, parallelism); + createChgTraceFunctions(traces, nChgCodes, m_parallelism); // Remove refs to traced values from TraceDecl nodes, these have now moved under // TraceInc diff --git a/test_regress/driver.pl b/test_regress/driver.pl index ffcfac4a8..fbae94f92 100755 --- a/test_regress/driver.pl +++ b/test_regress/driver.pl @@ -924,7 +924,6 @@ sub compile_vlt_flags { unshift @verilator_flags, "--trace" if $opt_trace; my $threads = ::calc_threads($Vltmt_threads); unshift @verilator_flags, "--threads $threads" if $param{vltmt} && $checkflags !~ /-threads /; - unshift @verilator_flags, "--trace-threads 1" if $param{vltmt} && $checkflags =~ /-trace /; unshift @verilator_flags, "--trace-threads 2" if $param{vltmt} && $checkflags =~ /-trace-fst /; unshift @verilator_flags, "--debug-partition" if $param{vltmt}; unshift @verilator_flags, "-CFLAGS -ggdb -LDFLAGS -ggdb" if $opt_gdbsim;