From a4ed3c20864b5161926239edd060a14ccd198c70 Mon Sep 17 00:00:00 2001 From: Geza Lore Date: Tue, 19 Jul 2022 17:06:26 +0100 Subject: [PATCH] Make parallel tracing switchable at run-time --- include/verilated_trace.h | 10 ++- include/verilated_trace_imp.h | 80 +++++++++---------- include/verilated_vcd_c.cpp | 140 ++++++++++++++++++---------------- include/verilated_vcd_c.h | 24 +++--- 4 files changed, 138 insertions(+), 116 deletions(-) diff --git a/include/verilated_trace.h b/include/verilated_trace.h index 895cecd26..e5e80904f 100644 --- a/include/verilated_trace.h +++ b/include/verilated_trace.h @@ -182,7 +182,7 @@ private: const bool m_offload; // Whether to use the offload thread (ignored if !VL_THREADED) -#ifdef VL_TRACE_PARALLEL +#ifdef VL_THREADED struct ParallelWorkerData { const dumpCb_t m_cb; // The callback void* const m_userp; // The use pointer to pass to the callback @@ -317,6 +317,14 @@ protected: static constexpr bool offload() { return false; } #endif + inline bool parallel() const { +#ifdef VL_TRACE_PARALLEL + return true; +#else + return false; +#endif + } + //========================================================================= // Virtual functions to be provided by the format specific implementation diff --git a/include/verilated_trace_imp.h b/include/verilated_trace_imp.h index c3eebc2dd..aed3a09a4 100644 --- a/include/verilated_trace_imp.h +++ b/include/verilated_trace_imp.h @@ -26,7 +26,7 @@ #include "verilated_intrinsics.h" #include "verilated_trace.h" -#ifdef VL_TRACE_PARALLEL +#ifdef VL_THREADED # include "verilated_threads.h" # include #endif @@ -462,7 +462,7 @@ void VerilatedTrace::dumpvars(int level, const std::string& } } -#ifdef VL_TRACE_PARALLEL +#ifdef VL_THREADED template <> // void VerilatedTrace::parallelWorkerTask(void* datap, bool) { ParallelWorkerData* const wdp = reinterpret_cast(datap); @@ -490,45 +490,47 @@ template <> VL_ATTR_NOINLINE void VerilatedTrace::ParallelWo template <> void VerilatedTrace::runCallbacks(const 
std::vector& cbVec) { -#ifdef VL_TRACE_PARALLEL - // If tracing in parallel, dispatch to the thread pool - VlThreadPool* threadPoolp = static_cast(m_contextp->threadPoolp()); - // List of work items for thread (std::list, as ParallelWorkerData is not movable) - std::list workerData; - // We use the whole pool + the main thread - const unsigned threads = threadPoolp->numThreads() + 1; - // Main thread executes all jobs with index % threads == 0 - std::vector mainThreadWorkerData; - // Enuque all the jobs - for (unsigned i = 0; i < cbVec.size(); ++i) { - const CallbackRecord& cbr = cbVec[i]; - // Always get the trace buffer on the main thread - Buffer* const bufp = getTraceBuffer(); - // Create new work item - workerData.emplace_back(cbr.m_dumpCb, cbr.m_userp, bufp); - // Grab the new work item - ParallelWorkerData* const itemp = &workerData.back(); - // Enqueue task to thread pool, or main thread - if (unsigned rem = i % threads) { - threadPoolp->workerp(rem - 1)->addTask(parallelWorkerTask, itemp); - } else { - mainThreadWorkerData.push_back(itemp); +#ifdef VL_THREADED + if (parallel()) { + // If tracing in parallel, dispatch to the thread pool + VlThreadPool* threadPoolp = static_cast(m_contextp->threadPoolp()); + // List of work items for thread (std::list, as ParallelWorkerData is not movable) + std::list workerData; + // We use the whole pool + the main thread + const unsigned threads = threadPoolp->numThreads() + 1; + // Main thread executes all jobs with index % threads == 0 + std::vector mainThreadWorkerData; + // Enqueue all the jobs + for (unsigned i = 0; i < cbVec.size(); ++i) { + const CallbackRecord& cbr = cbVec[i]; + // Always get the trace buffer on the main thread + Buffer* const bufp = getTraceBuffer(); + // Create new work item + workerData.emplace_back(cbr.m_dumpCb, cbr.m_userp, bufp); + // Grab the new work item + ParallelWorkerData* const itemp = &workerData.back(); + // Enqueue task to thread pool, or main thread + if (unsigned rem = i % 
threads) { + threadPoolp->workerp(rem - 1)->addTask(parallelWorkerTask, itemp); + } else { + mainThreadWorkerData.push_back(itemp); + } + } + // Execute main thread jobs + for (ParallelWorkerData* const itemp : mainThreadWorkerData) { + parallelWorkerTask(itemp, false); + } + // Commit all trace buffers in order + for (ParallelWorkerData& item : workerData) { + // Wait until ready + item.wait(); + // Commit the buffer + commitTraceBuffer(item.m_bufp); } - } - // Execute main thead jobs - for (ParallelWorkerData* const itemp : mainThreadWorkerData) { - parallelWorkerTask(itemp, false); - } - // Commit all trace buffers in order - for (ParallelWorkerData& item : workerData) { - // Wait until ready - item.wait(); - // Commit the buffer - commitTraceBuffer(item.m_bufp); - } - // Done - return; + // Done + return; + } #endif // Fall back on sequential execution for (const CallbackRecord& cbr : cbVec) { diff --git a/include/verilated_vcd_c.cpp b/include/verilated_vcd_c.cpp index ee3a43583..e5ce780cf 100644 --- a/include/verilated_vcd_c.cpp +++ b/include/verilated_vcd_c.cpp @@ -230,9 +230,11 @@ VerilatedVcd::~VerilatedVcd() { if (m_wrBufp) VL_DO_CLEAR(delete[] m_wrBufp, m_wrBufp = nullptr); deleteNameMap(); if (m_filep && m_fileNewed) VL_DO_CLEAR(delete m_filep, m_filep = nullptr); -#ifdef VL_TRACE_PARALLEL - assert(m_numBuffers == m_freeBuffers.size()); - for (auto& pair : m_freeBuffers) VL_DO_CLEAR(delete[] pair.first, pair.first = nullptr); +#ifdef VL_THREADED + if (parallel()) { + assert(m_numBuffers == m_freeBuffers.size()); + for (auto& pair : m_freeBuffers) VL_DO_CLEAR(delete[] pair.first, pair.first = nullptr); + } #endif } @@ -572,49 +574,55 @@ void VerilatedVcd::declDouble(uint32_t code, const char* name, bool array, int a VerilatedVcd::Buffer* VerilatedVcd::getTraceBuffer() { VerilatedVcd::Buffer* const bufp = new Buffer{*this}; -#ifdef VL_TRACE_PARALLEL - // Note: This is called from VeriltedVcd::dump, which already holds the lock - // If no buffer available, 
allocate a new one - if (m_freeBuffers.empty()) { - constexpr size_t pageSize = 4096; - // 4 * m_maxSignalBytes, so we can reserve 2 * m_maxSignalBytes at the end for safety - size_t startingSize = roundUpToMultipleOf(4 * m_maxSignalBytes); - m_freeBuffers.emplace_back(new char[startingSize], startingSize); - ++m_numBuffers; +#ifdef VL_THREADED + if (parallel()) { + // Note: This is called from VerilatedVcd::dump, which already holds the lock + // If no buffer available, allocate a new one + if (m_freeBuffers.empty()) { + constexpr size_t pageSize = 4096; + // 4 * m_maxSignalBytes, so we can reserve 2 * m_maxSignalBytes at the end for safety + size_t startingSize = roundUpToMultipleOf(4 * m_maxSignalBytes); + m_freeBuffers.emplace_back(new char[startingSize], startingSize); + ++m_numBuffers; + } + // Grab a buffer + const auto pair = m_freeBuffers.back(); + m_freeBuffers.pop_back(); + // Initialize + bufp->m_writep = bufp->m_bufp = pair.first; + bufp->m_size = pair.second; + bufp->adjustGrowp(); } - // Grab a buffer - const auto pair = m_freeBuffers.back(); - m_freeBuffers.pop_back(); - // Initialize - bufp->m_writep = bufp->m_bufp = pair.first; - bufp->m_size = pair.second; - bufp->adjustGrowp(); #endif // Return the buffer return bufp; } void VerilatedVcd::commitTraceBuffer(VerilatedVcd::Buffer* bufp) { -#ifdef VL_TRACE_PARALLEL - // Note: This is called from VeriltedVcd::dump, which already holds the lock - // Resize output buffer. Note, we use the full size of the trace buffer, as - // this is a lot more stable than the actual occupancy of the trace buffer. - // This helps us to avoid re-allocations due to small size changes. 
- bufferResize(bufp->m_size); - // Compute occupancy of buffer - const size_t usedSize = bufp->m_writep - bufp->m_bufp; - // Copy to output buffer - std::memcpy(m_writep, bufp->m_bufp, usedSize); - // Adjust write pointer - m_writep += usedSize; - // Flush if necessary - bufferCheck(); - // Put buffer back on free list - m_freeBuffers.emplace_back(bufp->m_bufp, bufp->m_size); + if (parallel()) { +#ifdef VL_THREADED + // Note: This is called from VerilatedVcd::dump, which already holds the lock + // Resize output buffer. Note, we use the full size of the trace buffer, as + // this is a lot more stable than the actual occupancy of the trace buffer. + // This helps us to avoid re-allocations due to small size changes. + bufferResize(bufp->m_size); + // Compute occupancy of buffer + const size_t usedSize = bufp->m_writep - bufp->m_bufp; + // Copy to output buffer + std::memcpy(m_writep, bufp->m_bufp, usedSize); + // Adjust write pointer + m_writep += usedSize; + // Flush if necessary + bufferCheck(); + // Put buffer back on free list + m_freeBuffers.emplace_back(bufp->m_bufp, bufp->m_size); #else - // Needs adjusting for emitTimeChange - m_writep = bufp->m_writep; + VL_FATAL_MT(__FILE__, __LINE__, "", "Unreachable"); #endif + } else { + // Needs adjusting for emitTimeChange + m_writep = bufp->m_writep; + } delete bufp; } @@ -656,35 +664,39 @@ void VerilatedVcdBuffer::finishLine(uint32_t code, char* writep) { // suffix, which was stored in the last byte of the suffix buffer entry. 
m_writep = writep + suffixp[VL_TRACE_SUFFIX_ENTRY_SIZE - 1]; -#ifdef VL_TRACE_PARALLEL - // Double the size of the buffer if necessary - if (VL_UNLIKELY(m_writep >= m_growp)) { - // Compute occupied size of current buffer - const size_t usedSize = m_writep - m_bufp; - // We are always doubling the size - m_size *= 2; - // Allocate the new buffer - char* const newBufp = new char[m_size]; - // Copy from current buffer to new buffer - std::memcpy(newBufp, m_bufp, usedSize); - // Delete current buffer - delete[] m_bufp; - // Make new buffer the current buffer - m_bufp = newBufp; - // Adjust write pointer - m_writep = m_bufp + usedSize; - // Adjust resize limit - adjustGrowp(); - } + if (m_owner.parallel()) { +#ifdef VL_THREADED + // Double the size of the buffer if necessary + if (VL_UNLIKELY(m_writep >= m_growp)) { + // Compute occupied size of current buffer + const size_t usedSize = m_writep - m_bufp; + // We are always doubling the size + m_size *= 2; + // Allocate the new buffer + char* const newBufp = new char[m_size]; + // Copy from current buffer to new buffer + std::memcpy(newBufp, m_bufp, usedSize); + // Delete current buffer + delete[] m_bufp; + // Make new buffer the current buffer + m_bufp = newBufp; + // Adjust write pointer + m_writep = m_bufp + usedSize; + // Adjust resize limit + adjustGrowp(); + } #else - // Flush the write buffer if there's not enough space left for new information - // We only call this once per vector, so we need enough slop for a very wide "b###" line - if (VL_UNLIKELY(m_writep > m_wrFlushp)) { - m_owner.m_writep = m_writep; - m_owner.bufferFlush(); - m_writep = m_owner.m_writep; - } + VL_FATAL_MT(__FILE__, __LINE__, "", "Unreachable"); #endif + } else { + // Flush the write buffer if there's not enough space left for new information + // We only call this once per vector, so we need enough slop for a very wide "b###" line + if (VL_UNLIKELY(m_writep > m_wrFlushp)) { + m_owner.m_writep = m_writep; + m_owner.bufferFlush(); + 
m_writep = m_owner.m_writep; + } + } } //============================================================================= diff --git a/include/verilated_vcd_c.h b/include/verilated_vcd_c.h index bba0f4253..853e976e7 100644 --- a/include/verilated_vcd_c.h +++ b/include/verilated_vcd_c.h @@ -65,7 +65,7 @@ private: using NameMap = std::map; NameMap* m_namemapp = nullptr; // List of names for the header -#ifdef VL_TRACE_PARALLEL +#ifdef VL_THREADED // Vector of free trace buffers as (pointer, size) pairs. std::vector> m_freeBuffers; size_t m_numBuffers = 0; // Number of trace buffers allocated @@ -168,30 +168,30 @@ class VerilatedVcdBuffer VL_NOT_FINAL { VerilatedVcd& m_owner; // Trace file owning this buffer. Required by subclasses. -#ifdef VL_TRACE_PARALLEL - char* m_writep = nullptr; // Write pointer into m_bufp - char* m_bufp = nullptr; // The beginning of the trace buffer - size_t m_size = 0; // The size of the buffer at m_bufp - char* m_growp = nullptr; // Resize limit pointer -#else - char* m_writep = m_owner.m_writep; // Write pointer into output buffer - char* const m_wrFlushp = m_owner.m_wrFlushp; // Output buffer flush trigger location -#endif + // Write pointer into output buffer (in parallel mode, this is set up in 'getTraceBuffer') + char* m_writep = m_owner.parallel() ? nullptr : m_owner.m_writep; + // Output buffer flush trigger location (only used when not parallel) + char* const m_wrFlushp = m_owner.parallel() ? 
nullptr : m_owner.m_wrFlushp; // VCD line end string codes + metadata const char* const m_suffixes = m_owner.m_suffixes.data(); // The maximum number of bytes a single signal can emit const size_t m_maxSignalBytes = m_owner.m_maxSignalBytes; - void finishLine(uint32_t code, char* writep); +#ifdef VL_THREADED + // Additional data for parallel tracing only + char* m_bufp = nullptr; // The beginning of the trace buffer + size_t m_size = 0; // The size of the buffer at m_bufp + char* m_growp = nullptr; // Resize limit pointer -#ifdef VL_TRACE_PARALLEL void adjustGrowp() { m_growp = (m_bufp + m_size) - (2 * m_maxSignalBytes); assert(m_growp >= m_bufp + m_maxSignalBytes); } #endif + void finishLine(uint32_t code, char* writep); + // CONSTRUCTOR explicit VerilatedVcdBuffer(VerilatedVcd& owner) : m_owner{owner} {}