diff --git a/include/verilated_trace.h b/include/verilated_trace.h index ee0bca3e2..bb0bce25a 100644 --- a/include/verilated_trace.h +++ b/include/verilated_trace.h @@ -81,36 +81,26 @@ public: } }; -// Commands used by thread tracing. Note that the bottom 8 bits of all these -// values are empty and are used to store small amounts of additional command -// parameters. Anonymous enum in class, as we want it scoped, but we also -// want the automatic conversion to integer types. +// Commands used by thread tracing. Anonymous enum in class, as we want +// it scoped, but we also want the automatic conversion to integer types. class VerilatedTraceCommand { public: + // These must all fit in 4 bit at the moment, as the tracing routines + // pack parameters in the top bits. enum { - CHG_BIT = 0x0000, - CHG_BUS = 0x0100, - CHG_QUAD = 0x0200, - CHG_ARRAY = 0x0300, - CHG_FLOAT = 0x0400, - CHG_DOUBLE = 0x0500, - TIME_CHANGE = 0x8000, - END = 0xf000, // End of buffer - SHUTDOWN = 0xf200 // Shutdown worker thread, also marks end of buffer + CHG_BIT_0 = 0x0, + CHG_BIT_1 = 0x1, + CHG_BUS = 0x2, + CHG_QUAD = 0x3, + CHG_ARRAY = 0x4, + CHG_FLOAT = 0x5, + CHG_DOUBLE = 0x6, + // TODO: full.. + TIME_CHANGE = 0xd, + END = 0xe, // End of buffer + SHUTDOWN = 0xf // Shutdown worker thread, also marks end of buffer }; }; - -typedef union { - vluint32_t cmd; // Command code + params in bottom 8 bits - vluint32_t* oldp; // Pointer to previous value buffer to consult/update - // Note: These are 64-bit wide, as this union already has a pointer type in it. - vluint64_t params; // Command parameter - // New signal value in various forms - vluint64_t newBits; - float newFloat; - double newDouble; - vluint64_t timeui; -} VerilatedTraceEntry; #endif class VerilatedTraceCallInfo; @@ -149,18 +139,18 @@ private: size_t m_traceBufferSize; // Buffers handed to worker for processing - VerilatedThreadQueue m_buffersToWorker; + VerilatedThreadQueue m_buffersToWorker; // Buffers returned from worker after processing - VerilatedThreadQueue m_buffersFromWorker; + VerilatedThreadQueue m_buffersFromWorker; // Get a new trace buffer that can be populated. May block if none available - VerilatedTraceEntry* getTraceBuffer(); + vluint32_t* getTraceBuffer(); // Write pointer into current buffer - VerilatedTraceEntry* m_traceBufferWritep; + vluint32_t* m_traceBufferWritep; // End of trace buffer - VerilatedTraceEntry* m_traceBufferEndp; + vluint32_t* m_traceBufferEndp; // The worker thread itself std::unique_ptr m_workerThread; @@ -169,7 +159,7 @@ private: void workerThreadMain(); // Wait until given buffer is placed in m_buffersFromWorker - void waitForBuffer(const VerilatedTraceEntry* bufferp); + void waitForBuffer(const vluint32_t* bufferp); // Shut down and join worker, if it's running, otherwise do nothing void shutdownWorker(); @@ -280,48 +270,47 @@ public: #ifdef VL_TRACE_THREADED // Threaded tracing. Just dump everything in the trace buffer - inline void chgBit(vluint32_t* oldp, vluint32_t newval) { - m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_BIT | newval; - m_traceBufferWritep[1].oldp = oldp; + inline void chgBit(vluint32_t code, vluint32_t newval) { + m_traceBufferWritep[0] = VerilatedTraceCommand::CHG_BIT_0 | newval; + m_traceBufferWritep[1] = code; m_traceBufferWritep += 2; VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); } - inline void chgBus(vluint32_t* oldp, vluint32_t newval, int bits) { - m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_BUS | bits; - m_traceBufferWritep[1].oldp = oldp; - m_traceBufferWritep[2].newBits = newval; + inline void chgBus(vluint32_t code, vluint32_t newval, int bits) { + m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_BUS; + m_traceBufferWritep[1] = code; + m_traceBufferWritep[2] = newval; m_traceBufferWritep += 3; VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); } - inline void chgQuad(vluint32_t* oldp, vluint64_t newval, int bits) { - m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_QUAD | bits; - m_traceBufferWritep[1].oldp = oldp; - m_traceBufferWritep[2].newBits = newval; + inline void chgQuad(vluint32_t code, vluint64_t newval, int bits) { + m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_QUAD; + m_traceBufferWritep[1] = code; + *reinterpret_cast(m_traceBufferWritep + 2) = newval; + m_traceBufferWritep += 4; + VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); + } + inline void chgArray(vluint32_t code, const vluint32_t* newvalp, int bits) { + m_traceBufferWritep[0] = (bits << 4) | VerilatedTraceCommand::CHG_ARRAY; + m_traceBufferWritep[1] = code; + m_traceBufferWritep += 2; + for (int i = 0; i < (bits + 31) / 32; ++i) { *m_traceBufferWritep++ = newvalp[i]; } + VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); + } + inline void chgFloat(vluint32_t code, float newval) { + m_traceBufferWritep[0] = VerilatedTraceCommand::CHG_FLOAT; + m_traceBufferWritep[1] = code; + // cppcheck-suppress invalidPointerCast + *reinterpret_cast(m_traceBufferWritep + 2) = newval; m_traceBufferWritep += 3; VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); } - inline void chgArray(vluint32_t* oldp, const vluint32_t* newvalp, int bits) { - m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_ARRAY; - m_traceBufferWritep[1].oldp = oldp; - m_traceBufferWritep[2].params = bits; - m_traceBufferWritep += 3; - vluint32_t* const wp = reinterpret_cast(m_traceBufferWritep); - for (int i = 0; i < (bits + 31) / 32; ++i) { wp[i] = newvalp[i]; } - m_traceBufferWritep += (bits + 63) / 64; - VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); - } - inline void chgFloat(vluint32_t* oldp, float newval) { - m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_FLOAT; - m_traceBufferWritep[1].oldp = oldp; - m_traceBufferWritep[2].newFloat = newval; - m_traceBufferWritep += 3; - VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); - } - inline void chgDouble(vluint32_t* oldp, double newval) { - m_traceBufferWritep[0].cmd = VerilatedTraceCommand::CHG_DOUBLE; - m_traceBufferWritep[1].oldp = oldp; - m_traceBufferWritep[2].newDouble = newval; - m_traceBufferWritep += 3; + inline void chgDouble(vluint32_t code, double newval) { + m_traceBufferWritep[0] = VerilatedTraceCommand::CHG_DOUBLE; + m_traceBufferWritep[1] = code; + // cppcheck-suppress invalidPointerCast + *reinterpret_cast(m_traceBufferWritep + 2) = newval; + m_traceBufferWritep += 4; VL_DEBUG_IF(assert(m_traceBufferWritep <= m_traceBufferEndp);); } diff --git a/include/verilated_trace_imp.cpp b/include/verilated_trace_imp.cpp index 3dc3c33d4..2e5345d65 100644 --- a/include/verilated_trace_imp.cpp +++ b/include/verilated_trace_imp.cpp @@ -101,8 +101,8 @@ public: // This is in .cpp file so is not widely visible //========================================================================= // Buffer management -template <> VerilatedTraceEntry* VerilatedTrace::getTraceBuffer() { - VerilatedTraceEntry* bufferp; +template <> vluint32_t* VerilatedTrace::getTraceBuffer() { + vluint32_t* bufferp; // Some jitter is expected, so some number of alternative trace buffers are // required, but don't allocate more than 8 buffers. if (m_numTraceBuffers < 8) { @@ -111,7 +111,7 @@ template <> VerilatedTraceEntry* VerilatedTrace::getTraceBuffer() ++m_numTraceBuffers; // Note: over allocate a bit so pointer comparison is well defined // if we overflow only by a small amount - bufferp = new VerilatedTraceEntry[m_traceBufferSize + 16]; + bufferp = new vluint32_t[m_traceBufferSize + 16]; } } else { // Block until a buffer becomes available @@ -120,10 +120,10 @@ template <> VerilatedTraceEntry* VerilatedTrace::getTraceBuffer() return bufferp; } -template <> void VerilatedTrace::waitForBuffer(const VerilatedTraceEntry* buffp) { +template <> void VerilatedTrace::waitForBuffer(const vluint32_t* buffp) { // Slow path code only called on flush/shutdown, so use a simple algorithm. // Collect buffers from worker and stash them until we get the one we want. - std::deque stash; + std::deque stash; do { stash.push_back(m_buffersFromWorker.get()); } while (stash.back() != buffp); // Now put them back in the queue, in the original order. while (!stash.empty()) { @@ -136,65 +136,71 @@ template <> void VerilatedTrace::waitForBuffer(const VerilatedTrac // Worker thread template <> void VerilatedTrace::workerThreadMain() { - while (true) { - VerilatedTraceEntry* const bufferp = m_buffersToWorker.get(); + bool shutdown = false; + + do { + vluint32_t* const bufferp = m_buffersToWorker.get(); VL_TRACE_THREAD_DEBUG(""); - VL_TRACE_THREAD_DEBUG("Got buffer"); + VL_TRACE_THREAD_DEBUG("Got buffer: " << bufferp); - const VerilatedTraceEntry* readp = bufferp; - - vluint32_t cmd; - unsigned bits; - vluint32_t* oldp; - vluint64_t newBits; + const vluint32_t* readp = bufferp; while (true) { - cmd = (readp++)->cmd; + const vluint32_t cmd = readp[0]; + const vluint32_t top = cmd >> 4; + // Always set this up, as it is almost always needed + vluint32_t* const oldp = m_sigs_oldvalp + readp[1]; + // Note this increment needs to be undone on commands which do not + // actually contain a code, but those are the rare cases. + readp += 2; - switch (cmd & ~0xFFU) { + switch (cmd & 0xF) { //=== // CHG_* commands - case VerilatedTraceCommand::CHG_BIT: - VL_TRACE_THREAD_DEBUG("Command CHG_BIT"); - chgBitImpl((readp++)->oldp, cmd & 1); + case VerilatedTraceCommand::CHG_BIT_0: + VL_TRACE_THREAD_DEBUG("Command CHG_BIT_0 " << top); + chgBitImpl(oldp, 0); + continue; + case VerilatedTraceCommand::CHG_BIT_1: + VL_TRACE_THREAD_DEBUG("Command CHG_BIT_1 " << top); + chgBitImpl(oldp, 1); continue; case VerilatedTraceCommand::CHG_BUS: - VL_TRACE_THREAD_DEBUG("Command CHG_BUS"); + VL_TRACE_THREAD_DEBUG("Command CHG_BUS " << top); // Bits stored in bottom byte of command - chgBusImpl(readp[0].oldp, readp[1].newBits, cmd & 0xFFULL); - readp += 2; - VL_TRACE_THREAD_DEBUG("Command CHG_BUS DONE"); + chgBusImpl(oldp, *readp, top); + readp += 1; continue; case VerilatedTraceCommand::CHG_QUAD: - VL_TRACE_THREAD_DEBUG("Command CHG_QUAD"); + VL_TRACE_THREAD_DEBUG("Command CHG_QUAD " << top); // Bits stored in bottom byte of command - chgQuadImpl(readp[0].oldp, readp[1].newBits, cmd & 0xFFULL); + chgQuadImpl(oldp, *reinterpret_cast(readp), top); readp += 2; continue; case VerilatedTraceCommand::CHG_ARRAY: - VL_TRACE_THREAD_DEBUG("Command CHG_CHG_ARRAY"); - oldp = (readp++)->oldp; - bits = (readp++)->params; - chgArrayImpl(oldp, reinterpret_cast(readp), bits); - readp += (bits + 63) / 64; + VL_TRACE_THREAD_DEBUG("Command CHG_ARRAY " << top); + chgArrayImpl(oldp, readp, top); + readp += (top + 31) / 32; continue; case VerilatedTraceCommand::CHG_FLOAT: - VL_TRACE_THREAD_DEBUG("Command CHG_FLOAT"); - chgFloatImpl(readp[0].oldp, readp[1].newFloat); - readp += 2; + VL_TRACE_THREAD_DEBUG("Command CHG_FLOAT " << top); + chgFloatImpl(oldp, *reinterpret_cast(readp)); + readp += 1; continue; case VerilatedTraceCommand::CHG_DOUBLE: - VL_TRACE_THREAD_DEBUG("Command CHG_DOUBLE"); - chgDoubleImpl(readp[0].oldp, readp[1].newDouble); + VL_TRACE_THREAD_DEBUG("Command CHG_DOUBLE " << top); + chgDoubleImpl(oldp, *reinterpret_cast(readp)); readp += 2; continue; //=== // Rare commands case VerilatedTraceCommand::TIME_CHANGE: - VL_TRACE_THREAD_DEBUG("Command TIME_CHANGE"); - emitTimeChange((readp++)->timeui); + VL_TRACE_THREAD_DEBUG("Command TIME_CHANGE " << top); + readp -= 1; // No code in this command, undo increment + emitTimeChange(*reinterpret_cast(readp)); + readp += 2; continue; //=== @@ -202,11 +208,13 @@ template <> void VerilatedTrace::workerThreadMain() { case VerilatedTraceCommand::END: VL_TRACE_THREAD_DEBUG("Command END"); break; case VerilatedTraceCommand::SHUTDOWN: VL_TRACE_THREAD_DEBUG("Command SHUTDOWN"); + shutdown = true; break; //=== // Unknown command default: + VL_TRACE_THREAD_DEBUG("Command UNKNOWN"); VL_PRINTF_MT("Trace command: 0x%08x\n", cmd); VL_FATAL_MT(__FILE__, __LINE__, "", "Unknown trace command"); break; @@ -221,10 +229,7 @@ template <> void VerilatedTrace::workerThreadMain() { // Return buffer m_buffersFromWorker.put(bufferp); - - // Shut down if required - if (VL_UNLIKELY(cmd == VerilatedTraceCommand::SHUTDOWN)) return; - } + } while (VL_LIKELY(!shutdown)); } template <> void VerilatedTrace::shutdownWorker() { @@ -232,8 +237,8 @@ template <> void VerilatedTrace::shutdownWorker() { if (!m_workerThread) return; // Hand an buffer with a shutdown command to the worker thread - VerilatedTraceEntry* const bufferp = getTraceBuffer(); - bufferp[0].cmd = VerilatedTraceCommand::SHUTDOWN; + vluint32_t* const bufferp = getTraceBuffer(); + bufferp[0] = VerilatedTraceCommand::SHUTDOWN; m_buffersToWorker.put(bufferp); // Wait for it to return waitForBuffer(bufferp); @@ -260,8 +265,8 @@ template <> void VerilatedTrace::close() { template <> void VerilatedTrace::flush() { #ifdef VL_TRACE_THREADED // Hand an empty buffer to the worker thread - VerilatedTraceEntry* const bufferp = getTraceBuffer(); - bufferp[0].cmd = VerilatedTraceCommand::END; + vluint32_t* const bufferp = getTraceBuffer(); + *bufferp = VerilatedTraceCommand::END; m_buffersToWorker.put(bufferp); // Wait for it to be returned. As the processing is in-order, // this ensures all previous buffers have been processed. @@ -333,10 +338,10 @@ template <> void VerilatedTrace::traceInit() VL_MT_UNSAFE { #ifdef VL_TRACE_THREADED // Compute trace buffer size. we need to be able to store a new value for // each signal, which is 'nextCode()' entries after the init callbacks - // above have been run, plus up to 3 more words of metadata per signal, - // plus fixed overhead of 1 for a termination flag and 2 for a time stamp + // above have been run, plus up to 2 more words of metadata per signal, + // plus fixed overhead of 1 for a termination flag and 3 for a time stamp // update. - m_traceBufferSize = nextCode() + numSignals() * 3 + 3; + m_traceBufferSize = nextCode() + numSignals() * 2 + 4; // Start the worker thread m_workerThread.reset(new std::thread(&VerilatedTrace::workerThreadMain, this)); @@ -406,18 +411,21 @@ template <> void VerilatedTrace::dump(vluint64_t timeui) { } #ifdef VL_TRACE_THREADED - // Get the trace buffer we are about to fill - VerilatedTraceEntry* const bufferp = getTraceBuffer(); - m_traceBufferWritep = bufferp; - m_traceBufferEndp = bufferp + m_traceBufferSize; - // Currently only incremental dumps run on the worker thread + vluint32_t* bufferp = nullptr; if (VL_LIKELY(!m_fullDump)) { + // Get the trace buffer we are about to fill + bufferp = getTraceBuffer(); + m_traceBufferWritep = bufferp; + m_traceBufferEndp = bufferp + m_traceBufferSize; + // Tell worker to update time point - (m_traceBufferWritep++)->cmd = VerilatedTraceCommand::TIME_CHANGE; - (m_traceBufferWritep++)->timeui = timeui; + m_traceBufferWritep[0] = VerilatedTraceCommand::TIME_CHANGE; + *reinterpret_cast(m_traceBufferWritep + 1) = timeui; + m_traceBufferWritep += 3; } else { // Update time point + flush(); emitTimeChange(timeui); } #else @@ -440,14 +448,16 @@ template <> void VerilatedTrace::dump(vluint64_t timeui) { } #ifdef VL_TRACE_THREADED - // Mark end of the trace buffer we just filled - (m_traceBufferWritep++)->cmd = VerilatedTraceCommand::END; + if (VL_LIKELY(bufferp)) { + // Mark end of the trace buffer we just filled + *m_traceBufferWritep++ = VerilatedTraceCommand::END; - // Assert no buffer overflow - assert(m_traceBufferWritep - bufferp <= m_traceBufferSize); + // Assert no buffer overflow + assert(m_traceBufferWritep - bufferp <= m_traceBufferSize); - // Pass it to the worker thread - m_buffersToWorker.put(bufferp); + // Pass it to the worker thread + m_buffersToWorker.put(bufferp); + } #endif } diff --git a/src/V3EmitC.cpp b/src/V3EmitC.cpp index ee4ae3e3d..061e0b1b3 100644 --- a/src/V3EmitC.cpp +++ b/src/V3EmitC.cpp @@ -3547,29 +3547,30 @@ class EmitCTrace : EmitCStmts { void emitTraceChangeOne(AstTraceInc* nodep, int arrayindex) { iterateAndNextNull(nodep->precondsp()); - string full = ((m_funcp->funcType() == AstCFuncType::TRACE_FULL - || m_funcp->funcType() == AstCFuncType::TRACE_FULL_SUB) - ? "full" - : "chg"); + const bool full = (m_funcp->funcType() == AstCFuncType::TRACE_FULL + || m_funcp->funcType() == AstCFuncType::TRACE_FULL_SUB); + const string func = full ? "full" : "chg"; bool emitWidth = false; if (nodep->dtypep()->basicp()->isDouble()) { - puts("vcdp->" + full + "Double"); + puts("vcdp->" + func + "Double"); } else if (nodep->isWide() || emitTraceIsScBv(nodep) || emitTraceIsScBigUint(nodep)) { - puts("vcdp->" + full + "Array"); + puts("vcdp->" + func + "Array"); emitWidth = true; } else if (nodep->isQuad()) { - puts("vcdp->" + full + "Quad"); + puts("vcdp->" + func + "Quad"); emitWidth = true; } else if (nodep->declp()->widthMin() > 1) { - puts("vcdp->" + full + "Bus"); + puts("vcdp->" + func + "Bus"); emitWidth = true; } else { - puts("vcdp->" + full + "Bit"); + puts("vcdp->" + func + "Bit"); } const uint32_t offset = (arrayindex < 0) ? 0 : (arrayindex * nodep->declp()->widthWords()); const uint32_t code = nodep->declp()->code() + offset; - puts("(oldp+" + cvtToStr(code - m_baseCode) + ","); + puts(v3Global.opt.trueTraceThreads() && !full ? "(base+" : "(oldp+"); + puts(cvtToStr(code - m_baseCode)); + puts(","); emitTraceValue(nodep, arrayindex); if (emitWidth) puts("," + cvtToStr(nodep->declp()->widthMin())); puts(");\n"); @@ -3648,8 +3649,14 @@ class EmitCTrace : EmitCStmts { nodep->stmtsp()->v3fatalSrc("Trace sub function should contain AstTraceInc"); } m_baseCode = stmtp->declp()->code(); - puts("vluint32_t* oldp = vcdp->oldp(code+" + cvtToStr(m_baseCode) + ");\n"); - puts("if (false && vcdp && oldp) {} // Prevent unused\n"); + if (v3Global.opt.trueTraceThreads() + && nodep->funcType() == AstCFuncType::TRACE_CHANGE_SUB) { + puts("vluint32_t base = code+" + cvtToStr(m_baseCode) + ";\n"); + puts("if (false && vcdp && base) {} // Prevent unused\n"); + } else { + puts("vluint32_t* oldp = vcdp->oldp(code+" + cvtToStr(m_baseCode) + ");\n"); + puts("if (false && vcdp && oldp) {} // Prevent unused\n"); + } } else if (nodep->funcType() == AstCFuncType::TRACE_INIT_SUB) { puts("int c = code;\n"); puts("if (false && vcdp && c) {} // Prevent unused\n"); diff --git a/src/V3EmitMk.cpp b/src/V3EmitMk.cpp index af8be2aa9..19816a465 100644 --- a/src/V3EmitMk.cpp +++ b/src/V3EmitMk.cpp @@ -69,9 +69,7 @@ public: of.puts("\n"); of.puts("# Tracing threaded output mode? 0/1/N threads (from --trace-thread)\n"); of.puts("VM_TRACE_THREADS = "); - of.puts(!v3Global.opt.traceThreads() - ? "0" - : cvtToStr(v3Global.opt.traceThreads() - v3Global.opt.traceFormat().fst())); + of.puts(cvtToStr(v3Global.opt.trueTraceThreads())); of.puts("\n"); of.puts("# Separate FST writer thread? 0/1 (from --trace-fst with --trace-thread > 0)\n"); of.puts("VM_TRACE_FST_WRITER_THREAD = "); diff --git a/src/V3Options.h b/src/V3Options.h index 6e4941ba8..28082d8a9 100644 --- a/src/V3Options.h +++ b/src/V3Options.h @@ -490,6 +490,9 @@ public: int traceMaxArray() const { return m_traceMaxArray; } int traceMaxWidth() const { return m_traceMaxWidth; } int traceThreads() const { return m_traceThreads; } + bool trueTraceThreads() const { + return traceThreads() == 0 ? 0 : traceThreads() - traceFormat().fst(); + } int unrollCount() const { return m_unrollCount; } int unrollStmts() const { return m_unrollStmts; }