diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 47b5f70b2..87310899f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -29,7 +29,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, ubuntu-18.04] + os: [ubuntu-22.04, ubuntu-20.04, ubuntu-18.04] compiler: - { cc: clang, cxx: clang++ } - { cc: gcc, cxx: g++ } @@ -37,9 +37,11 @@ jobs: exclude: # Build pull requests only with ubuntu-20.04 and without m32 - os: ${{ github.event_name == 'pull_request' && 'ubuntu-18.04' || 'do-not-exclude' }} + - os: ${{ github.event_name == 'pull_request' && 'ubuntu-22.04' || 'do-not-exclude' }} - m32: ${{ github.event_name == 'pull_request' && 1 || 'do-not-exclude' }} # Build -m32 only on ubuntu-20.04 - {os: ubuntu-18.04, m32: 1} + - {os: ubuntu-22.04, m32: 1} include: # Build GCC 10 on ubuntu-20.04 - os: ubuntu-20.04 @@ -95,7 +97,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, ubuntu-18.04] + os: [ubuntu-22.04, ubuntu-20.04, ubuntu-18.04] compiler: - { cc: clang, cxx: clang++ } - { cc: gcc, cxx: g++ } @@ -104,9 +106,11 @@ jobs: exclude: # Build pull requests only with ubuntu-20.04 and without m32 - os: ${{ github.event_name == 'pull_request' && 'ubuntu-18.04' || 'do-not-exclude' }} + - os: ${{ github.event_name == 'pull_request' && 'ubuntu-22.04' || 'do-not-exclude' }} - m32: ${{ github.event_name == 'pull_request' && 1 || 'do-not-exclude' }} # Build -m32 only on ubuntu-20.04 - {os: ubuntu-18.04, m32: 1} + - {os: ubuntu-22.04, m32: 1} include: # Test with GCC 10 on ubuntu-20.04 without m32 - {os: ubuntu-20.04, compiler: { cc: gcc-10, cxx: g++-10 }, m32: 0, suite: dist-vlt-0} @@ -122,7 +126,7 @@ jobs: CI_M32: ${{ matrix.m32 }} CC: ${{ matrix.compiler.cc }} CXX: ${{ matrix.compiler.cxx }} - CACHE_BASE_KEY: test-${{ matrix.os }}-${{ matrix.compiler.cc }}-m32=${{ matrix.m32 }}-${ matrix.suite }} + CACHE_BASE_KEY: test-${{ matrix.os }}-${{ matrix.compiler.cc }}-m32=${{ matrix.m32 }}-${{ matrix.suite }} 
CCACHE_MAXSIZE: 64M # Per build matrix entry (2160M in total) VERILATOR_ARCHIVE: verilator-${{ github.sha }}-${{ matrix.os }}-${{ matrix.compiler.cc }}${{ matrix.m32 && '-m32' || '' }}.tar.gz steps: diff --git a/Changes b/Changes index e2a165c88..69928d82d 100644 --- a/Changes +++ b/Changes @@ -22,12 +22,20 @@ Verilator 5.001 devel Verilator 4.223 devel ========================== +**Major:** + +* VCD tracing is now parallelized with --threads (#3449). [Geza Lore, Shunyao CAD] + **Minor:** +* Add -f options to replace -O options (#3436). +* Changed --no-merge-const-pool to -fno-merge-const-pool (#3436). * Support compile time trace signal selection with tracing_on/off (#3323). [Shunyao CAD] -* Add assert when VerilatedContext is mis-deleted (#3121). [Rupert Swarbrick] -* Define VM_TRACE_VCD when tracing in VCD format. [Geza Lore, Shunyao CAD] * Support non-ANSI interface port declarations (#3439). [Geza Lore, Shunyao CAD] +* Support concat assignment to packed array (#3446). +* Improve conditional merging optimization (#3125). [Geza Lore, Shunyao CAD] +* Define VM_TRACE_VCD when tracing in VCD format. [Geza Lore, Shunyao CAD] +* Add assert when VerilatedContext is mis-deleted (#3121). [Rupert Swarbrick] * Fix hang with large case statement optimization (#3405). [Mike Urbach] * Fix 'with' operator with type casting (#3387). [xiak95] * Fix incorrect conditional merging (#3409). [Raynard Qiao] diff --git a/bin/verilator b/bin/verilator index d936ebf56..7d27dad5d 100755 --- a/bin/verilator +++ b/bin/verilator @@ -319,6 +319,7 @@ detailed descriptions of these arguments. -f Parse arguments from a file -FI Force include of a file --flatten Force inlining of all modules, tasks and functions + -fno- Disable internal optimization stage -G= Overwrite top-level parameter --gdb Run Verilator under GDB interactively --gdbbt Run Verilator under GDB for backtrace @@ -344,7 +345,6 @@ detailed descriptions of these arguments. 
--MMD Create .d dependency files --MP Create phony dependency targets --Mdir Name of output object directory - --no-merge-const-pool Disable merging of different types in const pool --mod-prefix Name to prepend to lower classes --no-clk Prevent marking specified signal as clock --no-decoration Disable comments and symbol decorations @@ -404,7 +404,7 @@ detailed descriptions of these arguments. --trace-max-width Maximum array depth for tracing --trace-params Enable tracing of parameters --trace-structs Enable tracing structure names - --trace-threads Enable waveform creation on separate threads + --trace-threads Enable FST waveform creation on separate threads --trace-underscore Enable tracing of _signals -U Undefine preprocessor define --unroll-count Tune maximum loop iterations diff --git a/ci/ci-install.bash b/ci/ci-install.bash index f258916b4..4f61f06c4 100755 --- a/ci/ci-install.bash +++ b/ci/ci-install.bash @@ -54,8 +54,12 @@ if [ "$CI_BUILD_STAGE_NAME" = "build" ]; then if [ "$CI_OS_NAME" = "linux" ]; then sudo apt-get update - sudo apt-get install libfl-dev libgoogle-perftools-dev ccache - if [ "$CI_RUNS_ON" = "ubuntu-20.04" ]; then + sudo apt-get install libfl-dev ccache + if [ "$CI_RUNS_ON" != "ubuntu-22.04" ]; then + # Some conflict of libunwind version on 22.04, can live without it for now + sudo apt-get install libgoogle-perftools-dev + fi + if [ "$CI_RUNS_ON" = "ubuntu-20.04" ] || [ "$CI_RUNS_ON" = "ubuntu-22.04" ]; then sudo apt-get install libsystemc libsystemc-dev fi if [ "$COVERAGE" = 1 ]; then @@ -85,7 +89,7 @@ elif [ "$CI_BUILD_STAGE_NAME" = "test" ]; then sudo apt-get update # libfl-dev needed for internal coverage's test runs sudo apt-get install gdb gtkwave lcov libfl-dev ccache - if [ "$CI_RUNS_ON" = "ubuntu-20.04" ]; then + if [ "$CI_RUNS_ON" = "ubuntu-20.04" ] || [ "$CI_RUNS_ON" = "ubuntu-22.04" ]; then sudo apt-get install libsystemc-dev fi if [ "$CI_M32" = 1 ]; then diff --git a/configure.ac b/configure.ac index 2a2b99924..20fade5f0 
100644 --- a/configure.ac +++ b/configure.ac @@ -348,14 +348,18 @@ AC_SUBST(CFG_CXXFLAGS_PROFILE) # Flag to select newest language standard supported # Macros work such that first option that passes is the one we take -# Currently enabled c++14 due to packaged SystemC dependency -# c++14 is the newest that Verilator is regressed to support +# Currently enable c++17/c++14 due to packaged SystemC dependency +# c++17 is the newest that Verilator is regularly tested to support # c++11 is the oldest that Verilator supports # gnu is requried for Cygwin to compile verilated.h successfully #_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++20) #_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++20) -#_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++17) -#_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++17) +case "$(which lsb_release 2>&1 > /dev/null && lsb_release -d)" in +*Ubuntu*22.04*) +_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++17) +_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++17) +;; +esac _MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++14) _MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++14) _MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++11) diff --git a/docs/CONTRIBUTORS b/docs/CONTRIBUTORS index 8079639e2..d598cebd5 100644 --- a/docs/CONTRIBUTORS +++ b/docs/CONTRIBUTORS @@ -35,6 +35,7 @@ Guokai Chen Harald Heckmann Howard Su Huang Rui +Huanghuang Zhou HungMingWu HyungKi Jeong Iru Cai diff --git a/docs/guide/deprecations.rst b/docs/guide/deprecations.rst index 33c2ef610..8c0038453 100644 --- a/docs/guide/deprecations.rst +++ b/docs/guide/deprecations.rst @@ -20,6 +20,11 @@ Option `--cdc` The experimental `--cdc` option is believed to be generally unused and is planned for removal no sooner than January 2023. +Option `-O` + The debug `-O` options have been replaced with + `-fno-` debug options to match GCC. The old options are + planned for removal no sooner than June 2023. 
+ Option `--prof-threads` The `--prof-threads` option has been superseded by the `--prof-exec` and `--prof-pgo` options and is planned for removal no sooner than April 2023. diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst index 7a8e791f5..8cc56fa89 100644 --- a/docs/guide/exe_verilator.rst +++ b/docs/guide/exe_verilator.rst @@ -428,6 +428,52 @@ Summary: flattening large designs may require significant CPU time, memory and storage. +.. option:: -fno-acyc-simp + +.. option:: -fno-assemble + +.. option:: -fno-case + +.. option:: -fno-combine + +.. option:: -fno-const + +.. option:: -fno-const-bit-op-tree + +.. option:: -fno-dedup + +.. option:: -fno-expand + +.. option:: -fno-gate + +.. option:: -fno-inline + +.. option:: -fno-life + +.. option:: -fno-life-post + +.. option:: -fno-localize + +.. option:: -fno-merge-cond + +.. option:: -fno-merge-const-pool + +.. option:: -fno-reloop + +.. option:: -fno-reorder + +.. option:: -fno-split + +.. option:: -fno-subst + +.. option:: -fno-subst-const + +.. option:: -fno-table + + Rarely needed. Disables one of the internal optimization steps. These + are typically used only when recommended by a maintainer to help debug + or work around an issue. + .. option:: -G= Overwrites the given parameter of the toplevel module. The value is @@ -645,13 +691,6 @@ Summary: The directory is created if it does not exist and the parent directories exist; otherwise manually create the Mdir before calling Verilator. -.. option:: --no-merge-const-pool - - Rarely needed. In order to minimize cache footprint, values of different - data type, that are yet emitted identically in C++ are merged in the - constant pool. This option disables this and causes every constant pool - entry with a distinct data type to be emitted separately. - .. option:: --mod-prefix Specifies the name to prepend to all lower level classes. Defaults to @@ -700,9 +739,9 @@ Summary: Rarely needed. 
Enables or disables a specific optimizations, with the optimization selected based on the letter passed. A lowercase letter - disables an optimization, an upper case letter enables it. This is - intended for debugging use only; see the source code for - version-dependent mappings of optimizations to -O letters. + disables an optimization, an upper case letter enables it. This option + is deprecated and the various `-f` arguments should be + used instead. .. option:: -o @@ -1042,7 +1081,8 @@ Summary: is not thread safe. With "--threads 1", the generated model is single threaded but may run in a multithreaded environment. With "--threads N", where N >= 2, the model is generated to run multithreaded on up to N - threads. See :ref:`Multithreading`. + threads. See :ref:`Multithreading`. This option also applies to + :vlopt:`--trace` (but not :vlopt:`--trace-fst`). .. option:: --threads-dpi all @@ -1120,7 +1160,8 @@ Summary: Having tracing compiled in may result in some small performance losses, even when tracing is not turned on during model execution. - See also :vlopt:`--trace-threads` option. + When using :vlopt:`--threads`, VCD tracing is parallelized, using the + same number of threads as passed to :vlopt:`--threads`. .. option:: --trace-coverage @@ -1174,12 +1215,12 @@ Summary: .. option:: --trace-threads *threads* Enable waveform tracing using separate threads. This is typically faster - in simulation runtime but uses more total compute. This option is - independent of, and works with, both :vlopt:`--trace` and - :vlopt:`--trace-fst`. Different trace formats can take advantage of - more trace threads to varying degrees. Currently VCD tracing can utilize - at most "--trace-threads 1", and FST tracing can utilize at most - "--trace-threads 2". This overrides :vlopt:`--no-threads` . + in simulation runtime but uses more total compute. This option only + applies to :vlopt:`--trace-fst`. FST tracing can utilize at most + "--trace-threads 2". 
This overrides :vlopt:`--no-threads`. + + This option is accepted, but has absolutely no effect with + :vlopt:`--trace`, which respects :vlopt:`--threads` instead. .. option:: --trace-underscore diff --git a/docs/guide/faq.rst b/docs/guide/faq.rst index 5cc4acd43..0b70ea289 100644 --- a/docs/guide/faq.rst +++ b/docs/guide/faq.rst @@ -72,23 +72,38 @@ a good thing for getting working silicon. Will Verilator output remain under my own license/copyright? """""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -Yes, it's just like using GCC on your programs; this is why Verilator uses -the "GNU **Lesser** Public License Version 3" instead of the more typical -"GNU Public License". See the licenses for details, but in brief, if you -change Verilator itself or the header files Verilator includes, you must -make the source code available under the GNU Lesser Public License. -However, Verilator output (the Verilated code) only "include"s the licensed -files, and so you are **not** required to open-source release any output -from Verilator. +Your SystemVerilog, VPI/DPI, or main() C++ code remains under your own license. + +It's just like how using GCC on your programs does not change the copyright +of your program; this is why Verilator uses the "GNU **Lesser** Public +License Version 3" instead of the more typical "GNU Public License". See +the licenses for details. + +Some examples: + +* Any SystemVerilog or other input fed into Verilator remains your own. + +* Any of your VPI/DPI C++ routines that Verilator calls remain your own. + +* Any of your main() C++ code that calls into Verilator remains your own. + +* If you change Verilator itself, for example changing or adding a file + under the src/ directory in the repository, you must make the source code + available under the GNU Lesser Public License. 
+ +* If you change a header Verilator provides, for example under include/ in + the repository, you must make the source code available under the GNU + Lesser Public License. You also have the option of using the Perl Artistic License, which again -does not require you to release your Verilog or generated code, and also -allows you to modify Verilator for internal use without distributing the -modified version. But please contribute back to the community! +does not require you to release your Verilog, C++, or generated code. This +license also allows you to modify Verilator for internal use without +distributing the modified version. But please contribute back to the +community! -One limit is that you cannot under either license release a closed-source -Verilog simulation product incorporating Verilator. That is you can have a -commercial product, but must make the source code available. +Under both licenses you can offer a commercial product that is based on +Verilator either directly or embedded within. However, under both licenses, +any changes you make to Verilator for such a product must be open sourced. As is standard with Open Source, contributions back to Verilator will be placed under the Verilator copyright and LGPL/Artistic license. Small test diff --git a/docs/guide/verilating.rst b/docs/guide/verilating.rst index f443ca298..2af18c1f0 100644 --- a/docs/guide/verilating.rst +++ b/docs/guide/verilating.rst @@ -221,9 +221,13 @@ model, it may be beneficial to performance to adjust the influences the partitioning of the model by adjusting the assumed execution time of DPI imports. -The :vlopt:`--trace-threads` options can be used to produce trace dumps -using multiple threads. 
If :vlopt:`--trace-threads` is set without -:vlopt:`--threads`, then :vlopt:`--trace-threads` will imply +When using :vlopt:`--trace` to perform VCD tracing, the VCD trace +construction is parallelized using the same number of threads as specified +with :vlopt:`--threads`, and is executed on the same thread pool as the model. + +The :vlopt:`--trace-threads` options can be used with :vlopt:`--trace-fst` +to offload FST tracing using multiple threads. If :vlopt:`--trace-threads` is +given without :vlopt:`--threads`, then :vlopt:`--trace-threads` will imply :vlopt:`--threads 1 <--threads>`, i.e.: the support libraries will be thread safe. @@ -231,12 +235,12 @@ With :vlopt:`--trace-threads 0 <--trace-threads>`, trace dumps are produced on the main thread. This again gives the highest single thread performance. With :vlopt:`--trace-threads {N} <--trace-threads>`, where N is at least 1, -N additional threads will be created and managed by the trace files (e.g.: -VerilatedVcdC or VerilatedFstC), to generate the trace dump. The main -thread will be released to proceed with execution as soon as possible, -though some blocking of the main thread is still necessary while capturing -the trace. Different trace formats can utilize a various number of -threads. See the :vlopt:`--trace-threads` option. +up to N additional threads will be created and managed by the trace files +(e.g.: VerilatedFstC), to offload construction of the trace dump. The main +thread will be released to proceed with execution as soon as possible, though +some blocking of the main thread is still necessary while capturing the +trace. FST tracing can utilize up to 2 offload threads, so there is no use +of setting :vlopt:`--trace-threads` higher than 2 at the moment. When running a multithreaded model, the default Linux task scheduler often works against the model, by assuming threads are short lived, and thus @@ -441,7 +445,7 @@ SystemC include directories and link to the SystemC libraries. .. 
describe:: TRACE_THREADS - Optional. Generated multi-threaded trace dumping, same as + Optional. Generated multi-threaded FST trace dumping, same as "--trace-threads". .. describe:: TOP_MODULE diff --git a/docs/internals.rst b/docs/internals.rst index cf6b05d1e..104f18503 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -595,7 +595,7 @@ path through the graph is the sum of macro-task execution costs. Sarkar does almost the same thing, except that he has nonzero estimates for synchronization costs. -Verilator's cost estimates are assigned by ``InstrCountCostVisitor``. This +Verilator's cost estimates are assigned by ``InstrCountVisitor``. This class is perhaps the most fragile piece of the multithread implementation. It's easy to have a bug where you count something cheap (eg. accessing one element of a huge array) as if it were expensive (eg. diff --git a/docs/spelling.txt b/docs/spelling.txt index 9014a6af6..0e423ba26 100644 --- a/docs/spelling.txt +++ b/docs/spelling.txt @@ -683,6 +683,7 @@ onehot ooo oprofile oversubscription +parallelized param parameterized params @@ -771,6 +772,7 @@ specparam splitme spp sqrt +src srcdir srcfile sscanf @@ -889,6 +891,7 @@ writeme writemem writememb writememh +xiak xin xml xnor diff --git a/examples/cmake_tracing_c/CMakeLists.txt b/examples/cmake_tracing_c/CMakeLists.txt index 522c20cc5..95fb3dfb2 100644 --- a/examples/cmake_tracing_c/CMakeLists.txt +++ b/examples/cmake_tracing_c/CMakeLists.txt @@ -33,5 +33,5 @@ add_executable(example ../make_tracing_c/sim_main.cpp) # Add the Verilated circuit to the target verilate(example COVERAGE TRACE INCLUDE_DIRS "../make_tracing_c" - VERILATOR_ARGS -f ../make_tracing_c/input.vc -Os -x-assign 0 + VERILATOR_ARGS -f ../make_tracing_c/input.vc -x-assign fast SOURCES ../make_tracing_c/top.v) diff --git a/examples/cmake_tracing_sc/CMakeLists.txt b/examples/cmake_tracing_sc/CMakeLists.txt index 4651d1709..0d67a8cf5 100644 --- a/examples/cmake_tracing_sc/CMakeLists.txt +++ 
b/examples/cmake_tracing_sc/CMakeLists.txt @@ -45,7 +45,7 @@ set_property( # Add the Verilated circuit to the target verilate(example SYSTEMC COVERAGE TRACE INCLUDE_DIRS "../make_tracing_sc" - VERILATOR_ARGS -f ../make_tracing_sc/input.vc -Os -x-assign 0 + VERILATOR_ARGS -f ../make_tracing_sc/input.vc -x-assign fast SOURCES ../make_tracing_sc/top.v) verilator_link_systemc(example) diff --git a/examples/make_protect_lib/Makefile b/examples/make_protect_lib/Makefile index 215df0396..359ece33e 100644 --- a/examples/make_protect_lib/Makefile +++ b/examples/make_protect_lib/Makefile @@ -33,7 +33,7 @@ VERILATOR_FLAGS = # Generate C++ VERILATOR_FLAGS += -cc # Optimize -VERILATOR_FLAGS += -Os -x-assign 0 +VERILATOR_FLAGS += -x-assign fast # Warn abount lint issues; may not want this on less solid designs VERILATOR_FLAGS += -Wall # This example does not use vl_time_stamp but rather diff --git a/examples/make_tracing_c/Makefile b/examples/make_tracing_c/Makefile index be77c71e4..e7dcaf244 100644 --- a/examples/make_tracing_c/Makefile +++ b/examples/make_tracing_c/Makefile @@ -36,7 +36,7 @@ VERILATOR_FLAGS += -cc --exe # Generate makefile dependencies (not shown as complicates the Makefile) #VERILATOR_FLAGS += -MMD # Optimize -VERILATOR_FLAGS += -Os -x-assign 0 +VERILATOR_FLAGS += -x-assign fast # Warn abount lint issues; may not want this on less solid designs VERILATOR_FLAGS += -Wall # Make waveforms diff --git a/examples/make_tracing_sc/Makefile b/examples/make_tracing_sc/Makefile index 80a6221b2..5f90a5ebf 100644 --- a/examples/make_tracing_sc/Makefile +++ b/examples/make_tracing_sc/Makefile @@ -37,7 +37,7 @@ VERILATOR_FLAGS += -sc --exe # Generate makefile dependencies (not shown as complicates the Makefile) #VERILATOR_FLAGS += -MMD # Optimize -VERILATOR_FLAGS += -Os -x-assign 0 +VERILATOR_FLAGS += -x-assign fast # Warn abount lint issues; may not want this on less solid designs VERILATOR_FLAGS += -Wall # Make waveforms diff --git a/include/verilated.h 
b/include/verilated.h index 804d7363a..f9cf79601 100644 --- a/include/verilated.h +++ b/include/verilated.h @@ -147,7 +147,7 @@ extern uint32_t VL_THREAD_ID() VL_MT_SAFE; #if VL_THREADED -#define VL_LOCK_SPINS 50000 /// Number of times to spin for a mutex before relaxing +#define VL_LOCK_SPINS 50000 /// Number of times to spin for a mutex before yielding /// Mutex, wrapped to allow -fthread_safety checks class VL_CAPABILITY("mutex") VerilatedMutex final { diff --git a/include/verilated_fst_c.cpp b/include/verilated_fst_c.cpp index 68431db71..0bc1048cf 100644 --- a/include/verilated_fst_c.cpp +++ b/include/verilated_fst_c.cpp @@ -83,9 +83,11 @@ static_assert(static_cast(FST_ST_VCD_PROGRAM) == static_cast(VLT_TRACE //============================================================================= // Specialization of the generics for this trace format -#define VL_DERIVED_T VerilatedFst -#include "verilated_trace_imp.cpp" -#undef VL_DERIVED_T +#define VL_SUB_T VerilatedFst +#define VL_BUF_T VerilatedFstBuffer +#include "verilated_trace_imp.h" +#undef VL_SUB_T +#undef VL_BUF_T //============================================================================= // VerilatedFst @@ -111,7 +113,7 @@ void VerilatedFst::open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex) { m_curScope.clear(); - VerilatedTrace::traceInit(); + Super::traceInit(); // Clear the scope stack auto it = m_curScope.begin(); @@ -133,14 +135,14 @@ void VerilatedFst::open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex) { void VerilatedFst::close() VL_MT_SAFE_EXCLUDES(m_mutex) { const VerilatedLockGuard lock{m_mutex}; - VerilatedTrace::closeBase(); + Super::closeBase(); fstWriterClose(m_fst); m_fst = nullptr; } void VerilatedFst::flush() VL_MT_SAFE_EXCLUDES(m_mutex) { const VerilatedLockGuard lock{m_mutex}; - VerilatedTrace::flushBase(); + Super::flushBase(); fstWriterFlushContext(m_fst); } @@ -162,7 +164,7 @@ void VerilatedFst::declare(uint32_t code, const char* name, int dtypenum, fstVar int lsb) { 
const int bits = ((msb > lsb) ? (msb - lsb) : (lsb - msb)) + 1; - const bool enabled = VerilatedTrace::declCode(code, name, bits, false); + const bool enabled = Super::declCode(code, name, bits, false); if (!enabled) return; std::string nameasstr = namePrefix() + name; @@ -245,18 +247,42 @@ void VerilatedFst::declDouble(uint32_t code, const char* name, int dtypenum, fst declare(code, name, dtypenum, vardir, vartype, array, arraynum, false, 63, 0); } +//============================================================================= +// Get/commit trace buffer + +VerilatedFstBuffer* VerilatedFst::getTraceBuffer() { return new VerilatedFstBuffer{*this}; } + +void VerilatedFst::commitTraceBuffer(VerilatedFstBuffer* bufp) { +#ifdef VL_TRACE_OFFLOAD + if (bufp->m_offloadBufferWritep) { + m_offloadBufferWritep = bufp->m_offloadBufferWritep; + return; // Buffer will be deleted by the offload thread + } +#endif + delete bufp; +} + +//============================================================================= +// VerilatedFstBuffer implementation + +VerilatedFstBuffer::VerilatedFstBuffer(VerilatedFst& owner) + : VerilatedTraceBuffer{owner} {} + +//============================================================================= +// Trace rendering primitives + // Note: emit* are only ever called from one place (full* in -// verilated_trace_imp.cpp, which is included in this file at the top), +// verilated_trace_imp.h, which is included in this file at the top), // so always inline them. VL_ATTR_ALWINLINE -void VerilatedFst::emitBit(uint32_t code, CData newval) { +void VerilatedFstBuffer::emitBit(uint32_t code, CData newval) { VL_DEBUG_IFDEF(assert(m_symbolp[code]);); fstWriterEmitValueChange(m_fst, m_symbolp[code], newval ? 
"1" : "0"); } VL_ATTR_ALWINLINE -void VerilatedFst::emitCData(uint32_t code, CData newval, int bits) { +void VerilatedFstBuffer::emitCData(uint32_t code, CData newval, int bits) { char buf[VL_BYTESIZE]; VL_DEBUG_IFDEF(assert(m_symbolp[code]);); cvtCDataToStr(buf, newval << (VL_BYTESIZE - bits)); @@ -264,7 +290,7 @@ void VerilatedFst::emitCData(uint32_t code, CData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedFst::emitSData(uint32_t code, SData newval, int bits) { +void VerilatedFstBuffer::emitSData(uint32_t code, SData newval, int bits) { char buf[VL_SHORTSIZE]; VL_DEBUG_IFDEF(assert(m_symbolp[code]);); cvtSDataToStr(buf, newval << (VL_SHORTSIZE - bits)); @@ -272,7 +298,7 @@ void VerilatedFst::emitSData(uint32_t code, SData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedFst::emitIData(uint32_t code, IData newval, int bits) { +void VerilatedFstBuffer::emitIData(uint32_t code, IData newval, int bits) { char buf[VL_IDATASIZE]; VL_DEBUG_IFDEF(assert(m_symbolp[code]);); cvtIDataToStr(buf, newval << (VL_IDATASIZE - bits)); @@ -280,7 +306,7 @@ void VerilatedFst::emitIData(uint32_t code, IData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedFst::emitQData(uint32_t code, QData newval, int bits) { +void VerilatedFstBuffer::emitQData(uint32_t code, QData newval, int bits) { char buf[VL_QUADSIZE]; VL_DEBUG_IFDEF(assert(m_symbolp[code]);); cvtQDataToStr(buf, newval << (VL_QUADSIZE - bits)); @@ -288,7 +314,7 @@ void VerilatedFst::emitQData(uint32_t code, QData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedFst::emitWData(uint32_t code, const WData* newvalp, int bits) { +void VerilatedFstBuffer::emitWData(uint32_t code, const WData* newvalp, int bits) { int words = VL_WORDS_I(bits); char* wp = m_strbuf; // Convert the most significant word @@ -304,6 +330,6 @@ void VerilatedFst::emitWData(uint32_t code, const WData* newvalp, int bits) { } VL_ATTR_ALWINLINE -void VerilatedFst::emitDouble(uint32_t code, double newval) { +void 
VerilatedFstBuffer::emitDouble(uint32_t code, double newval) { fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval); } diff --git a/include/verilated_fst_c.h b/include/verilated_fst_c.h index b622a1894..5131cc8cc 100644 --- a/include/verilated_fst_c.h +++ b/include/verilated_fst_c.h @@ -31,15 +31,19 @@ #include #include +class VerilatedFstBuffer; + //============================================================================= // VerilatedFst // Base class to create a Verilator FST dump // This is an internally used class - see VerilatedFstC for what to call from applications -class VerilatedFst final : public VerilatedTrace { +class VerilatedFst final : public VerilatedTrace { +public: + using Super = VerilatedTrace; + private: - // Give the superclass access to private bits (to avoid virtual functions) - friend class VerilatedTrace; + friend Buffer; // Give the buffer access to the private bits //========================================================================= // FST specific internals @@ -60,31 +64,26 @@ protected: //========================================================================= // Implementation of VerilatedTrace interface - // Implementations of protected virtual methods for VerilatedTrace + // Called when the trace moves forward to a new time point virtual void emitTimeChange(uint64_t timeui) override; // Hooks called from VerilatedTrace virtual bool preFullDump() override { return isOpen(); } virtual bool preChangeDump() override { return isOpen(); } - // Implementations of duck-typed methods for VerilatedTrace. These are - // called from only one place (namely full*) so always inline them. 
- inline void emitBit(uint32_t code, CData newval); - inline void emitCData(uint32_t code, CData newval, int bits); - inline void emitSData(uint32_t code, SData newval, int bits); - inline void emitIData(uint32_t code, IData newval, int bits); - inline void emitQData(uint32_t code, QData newval, int bits); - inline void emitWData(uint32_t code, const WData* newvalp, int bits); - inline void emitDouble(uint32_t code, double newval); + // Trace buffer management + virtual VerilatedFstBuffer* getTraceBuffer() override; + virtual void commitTraceBuffer(VerilatedFstBuffer*) override; public: //========================================================================= // External interface to client code - // (All must be threadsafe) + // CONSTRUCTOR explicit VerilatedFst(void* fst = nullptr); ~VerilatedFst(); + // METHODS - All must be thread safe // Open the file; call isOpen() to see if errors void open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex); // Close the file @@ -97,11 +96,6 @@ public: //========================================================================= // Internal interface to Verilator generated code - // Inside dumping routines, declare a data type - void declDTypeEnum(int dtypenum, const char* name, uint32_t elements, unsigned int minValbits, - const char** itemNamesp, const char** itemValuesp); - - // Inside dumping routines, declare a signal void declBit(uint32_t code, const char* name, int dtypenum, fstVarDir vardir, fstVarType vartype, bool array, int arraynum); void declBus(uint32_t code, const char* name, int dtypenum, fstVarDir vardir, @@ -112,18 +106,55 @@ public: fstVarType vartype, bool array, int arraynum, int msb, int lsb); void declDouble(uint32_t code, const char* name, int dtypenum, fstVarDir vardir, fstVarType vartype, bool array, int arraynum); + + void declDTypeEnum(int dtypenum, const char* name, uint32_t elements, unsigned int minValbits, + const char** itemNamesp, const char** itemValuesp); }; #ifndef DOXYGEN // Declare 
specialization here as it's used in VerilatedFstC just below -template <> void VerilatedTrace::dump(uint64_t timeui); -template <> void VerilatedTrace::set_time_unit(const char* unitp); -template <> void VerilatedTrace::set_time_unit(const std::string& unit); -template <> void VerilatedTrace::set_time_resolution(const char* unitp); -template <> void VerilatedTrace::set_time_resolution(const std::string& unit); -template <> void VerilatedTrace::dumpvars(int level, const std::string& hier); +template <> void VerilatedFst::Super::dump(uint64_t time); +template <> void VerilatedFst::Super::set_time_unit(const char* unitp); +template <> void VerilatedFst::Super::set_time_unit(const std::string& unit); +template <> void VerilatedFst::Super::set_time_resolution(const char* unitp); +template <> void VerilatedFst::Super::set_time_resolution(const std::string& unit); +template <> void VerilatedFst::Super::dumpvars(int level, const std::string& hier); #endif +//============================================================================= +// VerilatedFstBuffer + +class VerilatedFstBuffer final : public VerilatedTraceBuffer { + // Give the trace file access to the private bits + friend VerilatedFst; + friend VerilatedFst::Super; + + // The FST file handle + void* const m_fst = m_owner.m_fst; + // code to fstHandle map, as an array + const fstHandle* const m_symbolp = m_owner.m_symbolp; + // String buffer long enough to hold maxBits() chars + char* const m_strbuf = m_owner.m_strbuf; + +public: + // CONSTRUCTOR + explicit VerilatedFstBuffer(VerilatedFst& owner); + ~VerilatedFstBuffer() = default; + + //========================================================================= + // Implementation of VerilatedTraceBuffer interface + + // Implementations of duck-typed methods for VerilatedTraceBuffer. These are + // called from only one place (the full* methods), so always inline them. 
+ VL_ATTR_ALWINLINE inline void emitBit(uint32_t code, CData newval); + VL_ATTR_ALWINLINE inline void emitCData(uint32_t code, CData newval, int bits); + VL_ATTR_ALWINLINE inline void emitSData(uint32_t code, SData newval, int bits); + VL_ATTR_ALWINLINE inline void emitIData(uint32_t code, IData newval, int bits); + VL_ATTR_ALWINLINE inline void emitQData(uint32_t code, QData newval, int bits); + VL_ATTR_ALWINLINE inline void emitWData(uint32_t code, const WData* newvalp, int bits); + VL_ATTR_ALWINLINE inline void emitDouble(uint32_t code, double newval); +}; + //============================================================================= // VerilatedFstC /// Create a FST dump file in C standalone (no SystemC) simulations. diff --git a/include/verilated_profiler.cpp b/include/verilated_profiler.cpp index 1a5f16a36..ed25093d1 100644 --- a/include/verilated_profiler.cpp +++ b/include/verilated_profiler.cpp @@ -60,7 +60,7 @@ uint16_t VlExecutionRecord::getcpu() { //============================================================================= // VlExecutionProfiler implementation -template size_t roundUptoMultipleOf(size_t value) { +template static size_t roundUptoMultipleOf(size_t value) { static_assert((N & (N - 1)) == 0, "'N' must be a power of 2"); size_t mask = N - 1; return (value + mask) & ~mask; diff --git a/include/verilated_trace.h b/include/verilated_trace.h index a88ce6b50..7915c3645 100644 --- a/include/verilated_trace.h +++ b/include/verilated_trace.h @@ -22,28 +22,43 @@ #ifndef VERILATOR_VERILATED_TRACE_H_ #define VERILATOR_VERILATED_TRACE_H_ -#ifdef VL_TRACE_THREADED -#define VL_TRACE_OFFLOAD +// clang-format off + +// In FST mode, VL_TRACE_THREADED enables offloading, but only if we also have +// the FST writer thread. 
This means with --trace-threads 1, we get the FST +// writer thread only, and with --trace-threads 2 we get offloading as well +#if defined(VL_TRACE_FST_WRITER_THREAD) && defined(VL_TRACE_THREADED) +# define VL_TRACE_OFFLOAD +#endif +// VCD tracing can happen fully in parallel +#if defined(VM_TRACE_VCD) && VM_TRACE_VCD && defined(VL_TRACE_THREADED) +# define VL_TRACE_PARALLEL #endif -// clang-format off +#if defined(VL_TRACE_PARALLEL) && defined(VL_TRACE_OFFLOAD) +# error "Cannot have VL_TRACE_PARALLEL and VL_TRACE_OFFLOAD together" +#endif #include "verilated.h" #include "verilated_trace_defs.h" #include +#include #include #include +#include #include #ifdef VL_TRACE_OFFLOAD -# include # include # include #endif // clang-format on +class VlThreadPool; +template class VerilatedTraceBuffer; + #ifdef VL_TRACE_OFFLOAD //============================================================================= // Offloaded tracing @@ -106,7 +121,8 @@ public: CHG_WDATA = 0x6, CHG_DOUBLE = 0x8, // TODO: full.. - TIME_CHANGE = 0xd, + TIME_CHANGE = 0xc, + TRACE_BUFFER = 0xd, END = 0xe, // End of buffer SHUTDOWN = 0xf // Shutdown worker thread, also marks end of buffer }; @@ -116,16 +132,22 @@ public: //============================================================================= // VerilatedTrace -// VerilatedTrace uses F-bounded polymorphism to access duck-typed -// implementations in the format specific derived class, which must be passed -// as the type parameter T_Derived -template class VerilatedTrace VL_NOT_FINAL { +// T_Trace is the format specific subclass of VerilatedTrace. +// T_Buffer is the format specific subclass of VerilatedTraceBuffer. 
+template class VerilatedTrace VL_NOT_FINAL { + // Give the buffer (both base and derived) access to the private bits + friend VerilatedTraceBuffer; + friend T_Buffer; + public: + using Buffer = T_Buffer; + //========================================================================= // Generic tracing internals - using initCb_t = void (*)(void*, T_Derived*, uint32_t); // Type of init callbacks - using dumpCb_t = void (*)(void*, T_Derived*); // Type of all but init callbacks + using initCb_t = void (*)(void*, T_Trace*, uint32_t); // Type of init callbacks + using dumpCb_t = void (*)(void*, Buffer*); // Type of dump callbacks + using cleanupCb_t = void (*)(void*, T_Trace*); // Type of cleanup callbacks private: struct CallbackRecord { @@ -133,9 +155,10 @@ private: // (the one in Ubuntu 14.04 with GCC 4.8.4 in particular) use the // assignment operator on inserting into collections, so they don't work // with const fields... - union { - initCb_t m_initCb; // The callback function - dumpCb_t m_dumpCb; // The callback function + union { // The callback + initCb_t m_initCb; + dumpCb_t m_dumpCb; + cleanupCb_t m_cleanupCb; }; void* m_userp; // The user pointer to pass to the callback (the symbol table) CallbackRecord(initCb_t cb, void* userp) @@ -144,32 +167,66 @@ private: CallbackRecord(dumpCb_t cb, void* userp) : m_dumpCb{cb} , m_userp{userp} {} + CallbackRecord(cleanupCb_t cb, void* userp) + : m_cleanupCb{cb} + , m_userp{userp} {} }; - uint32_t* m_sigs_oldvalp; // Old value store - EData* m_sigs_enabledp; // Bit vector of enabled codes (nullptr = all on) - uint64_t m_timeLastDump; // Last time we did a dump +#ifdef VL_TRACE_PARALLEL + struct ParallelWorkerData { + const dumpCb_t m_cb; // The callback + void* const m_userp; // The user pointer to pass to the callback + Buffer* const m_bufp; // The buffer pointer to pass to the callback + std::atomic m_ready{false}; // The ready flag + mutable VerilatedMutex m_mutex; // Mutex for suspension until ready +
std::condition_variable_any m_cv; // Condition variable for suspension + bool m_waiting VL_GUARDED_BY(m_mutex) = false; // Whether a thread is suspended in wait() + + void wait(); + + ParallelWorkerData(dumpCb_t cb, void* userp, Buffer* bufp) + : m_cb{cb} + , m_userp{userp} + , m_bufp{bufp} {} + }; + + // Passed a ParallelWorkerData*, second argument is ignored + static void parallelWorkerTask(void*, bool); +#endif + + using ParallelCallbackMap = std::unordered_map>; + +protected: + uint32_t* m_sigs_oldvalp = nullptr; // Previous value store + EData* m_sigs_enabledp = nullptr; // Bit vector of enabled codes (nullptr = all on) +private: + uint64_t m_timeLastDump = 0; // Last time we did a dump std::vector m_sigs_enabledVec; // Staging for m_sigs_enabledp - std::vector m_initCbs; // Routines to initialize traciong - std::vector m_fullCbs; // Routines to perform full dump - std::vector m_chgCbs; // Routines to perform incremental dump + std::vector m_initCbs; // Routines to initialize tracing + ParallelCallbackMap m_fullCbs; // Routines to perform full dump + ParallelCallbackMap m_chgCbs; // Routines to perform incremental dump std::vector m_cleanupCbs; // Routines to call at the end of dump - bool m_fullDump; // Whether a full dump is required on the next call to 'dump' - uint32_t m_nextCode; // Next code number to assign - uint32_t m_numSignals; // Number of distinct signals - uint32_t m_maxBits; // Number of bits in the widest signal + std::vector m_threadPoolps; // All thread pools, in insertion order + bool m_fullDump = true; // Whether a full dump is required on the next call to 'dump' + uint32_t m_nextCode = 0; // Next code number to assign + uint32_t m_numSignals = 0; // Number of distinct signals + uint32_t m_maxBits = 0; // Number of bits in the widest signal std::vector m_namePrefixStack{""}; // Path prefixes to add to signal names std::vector> m_dumpvars; // dumpvar() entries - char m_scopeEscape; - double m_timeRes; // Time resolution (ns/ms etc) - double 
m_timeUnit; // Time units (ns/ms etc) + char m_scopeEscape = '.'; + double m_timeRes = 1e-9; // Time resolution (ns/ms etc) + double m_timeUnit = 1e-0; // Time units (ns/ms etc) + + void addThreadPool(VlThreadPool* threadPoolp) VL_MT_SAFE_EXCLUDES(m_mutex); void addCallbackRecord(std::vector& cbVec, CallbackRecord& cbRec) VL_MT_SAFE_EXCLUDES(m_mutex); - // Equivalent to 'this' but is of the sub-type 'T_Derived*'. Use 'self()->' + // Equivalent to 'this' but is of the sub-type 'T_Trace*'. Use 'self()->' // to access duck-typed functions to avoid a virtual function call. - T_Derived* self() { return static_cast(this); } + T_Trace* self() { return static_cast(this); } + + void runParallelCallbacks(const ParallelCallbackMap& cbMap); // Flush any remaining data for this file static void onFlush(void* selfp) VL_MT_UNSAFE_ONE; @@ -178,17 +235,21 @@ private: #ifdef VL_TRACE_OFFLOAD // Number of total offload buffers that have been allocated - uint32_t m_numOffloadBuffers; + uint32_t m_numOffloadBuffers = 0; // Size of offload buffers - size_t m_offloadBufferSize; + size_t m_offloadBufferSize = 0; // Buffers handed to worker for processing VerilatedThreadQueue m_offloadBuffersToWorker; // Buffers returned from worker after processing VerilatedThreadQueue m_offloadBuffersFromWorker; + +protected: // Write pointer into current buffer - uint32_t* m_offloadBufferWritep; + uint32_t* m_offloadBufferWritep = nullptr; // End of offload buffer - uint32_t* m_offloadBufferEndp; + uint32_t* m_offloadBufferEndp = nullptr; + +private: // The offload worker thread itself std::unique_ptr m_workerThread; @@ -250,6 +311,10 @@ protected: virtual bool preFullDump() = 0; virtual bool preChangeDump() = 0; + // Trace buffer management + virtual Buffer* getTraceBuffer() = 0; + virtual void commitTraceBuffer(Buffer*) = 0; + public: //========================================================================= // External interface to client code @@ -270,19 +335,55 @@ public: // Call void dump(uint64_t 
timeui) VL_MT_SAFE_EXCLUDES(m_mutex); + //========================================================================= + // Internal interface to Verilator generated code + //========================================================================= // Non-hot path internal interface to Verilator generated code void addInitCb(initCb_t cb, void* userp) VL_MT_SAFE; - void addFullCb(dumpCb_t cb, void* userp) VL_MT_SAFE; - void addChgCb(dumpCb_t cb, void* userp) VL_MT_SAFE; - void addCleanupCb(dumpCb_t cb, void* userp) VL_MT_SAFE; + void addFullCb(dumpCb_t cb, void* userp, VlThreadPool* = nullptr) VL_MT_SAFE; + void addChgCb(dumpCb_t cb, void* userp, VlThreadPool* = nullptr) VL_MT_SAFE; + void addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE; void scopeEscape(char flag) { m_scopeEscape = flag; } void pushNamePrefix(const std::string&); void popNamePrefix(unsigned count = 1); +}; +//============================================================================= +// VerilatedTraceBuffer + +// T_Trace is the format specific subclass of VerilatedTrace. +// T_Buffer is the format specific subclass of VerilatedTraceBuffer. +// The format-specific hot-path methods use duck-typing via T_Buffer for performance. +template class VerilatedTraceBuffer VL_NOT_FINAL { + friend T_Trace; // Give the trace file access to the private bits + +protected: + T_Trace& m_owner; // The VerilatedTrace subclass that owns this buffer + + // Previous value store + uint32_t* const m_sigs_oldvalp = m_owner.m_sigs_oldvalp; + // Bit vector of enabled codes (nullptr = all on) + EData* const m_sigs_enabledp = m_owner.m_sigs_enabledp; + +#ifdef VL_TRACE_OFFLOAD + // Write pointer into current buffer + uint32_t* m_offloadBufferWritep = m_owner.m_offloadBufferWritep; + // End of offload buffer + uint32_t* const m_offloadBufferEndp = m_owner.m_offloadBufferEndp; +#endif + + // Equivalent to 'this' but is of the sub-type 'T_Buffer*'.
Use 'self()->' + // to access duck-typed functions to avoid a virtual function call. + inline T_Buffer* self() { return static_cast(this); } + + explicit VerilatedTraceBuffer(T_Trace& owner); + virtual ~VerilatedTraceBuffer() = default; + +public: //========================================================================= // Hot path internal interface to Verilator generated code @@ -300,7 +401,7 @@ public: // duck-typed void emitWData(uint32_t code, const WData* newvalp, int bits) = 0; // duck-typed void emitDouble(uint32_t code, double newval) = 0; - uint32_t* oldp(uint32_t code) { return m_sigs_oldvalp + code; } + VL_ATTR_ALWINLINE inline uint32_t* oldp(uint32_t code) { return m_sigs_oldvalp + code; } // Write to previous value buffer value and emit trace entry. void fullBit(uint32_t* oldp, CData newval); @@ -363,9 +464,13 @@ public: VL_DEBUG_IF(assert(m_offloadBufferWritep <= m_offloadBufferEndp);); } -#define CHG(name) chg##name##Impl -#else -#define CHG(name) chg##name +#define chgBit chgBitImpl +#define chgCData chgCDataImpl +#define chgSData chgSDataImpl +#define chgIData chgIDataImpl +#define chgQData chgQDataImpl +#define chgWData chgWDataImpl +#define chgDouble chgDoubleImpl #endif // In non-offload mode, these are called directly by the trace callbacks, @@ -373,27 +478,27 @@ public: // thread and are called chg*Impl // Check previous dumped value of signal.
If changed, then emit trace entry - inline void CHG(Bit)(uint32_t* oldp, CData newval) { + VL_ATTR_ALWINLINE inline void chgBit(uint32_t* oldp, CData newval) { const uint32_t diff = *oldp ^ newval; if (VL_UNLIKELY(diff)) fullBit(oldp, newval); } - inline void CHG(CData)(uint32_t* oldp, CData newval, int bits) { + VL_ATTR_ALWINLINE inline void chgCData(uint32_t* oldp, CData newval, int bits) { const uint32_t diff = *oldp ^ newval; if (VL_UNLIKELY(diff)) fullCData(oldp, newval, bits); } - inline void CHG(SData)(uint32_t* oldp, SData newval, int bits) { + VL_ATTR_ALWINLINE inline void chgSData(uint32_t* oldp, SData newval, int bits) { const uint32_t diff = *oldp ^ newval; if (VL_UNLIKELY(diff)) fullSData(oldp, newval, bits); } - inline void CHG(IData)(uint32_t* oldp, IData newval, int bits) { + VL_ATTR_ALWINLINE inline void chgIData(uint32_t* oldp, IData newval, int bits) { const uint32_t diff = *oldp ^ newval; if (VL_UNLIKELY(diff)) fullIData(oldp, newval, bits); } - inline void CHG(QData)(uint32_t* oldp, QData newval, int bits) { + VL_ATTR_ALWINLINE inline void chgQData(uint32_t* oldp, QData newval, int bits) { const uint64_t diff = *reinterpret_cast(oldp) ^ newval; if (VL_UNLIKELY(diff)) fullQData(oldp, newval, bits); } - inline void CHG(WData)(uint32_t* oldp, const WData* newvalp, int bits) { + VL_ATTR_ALWINLINE inline void chgWData(uint32_t* oldp, const WData* newvalp, int bits) { for (int i = 0; i < (bits + 31) / 32; ++i) { if (VL_UNLIKELY(oldp[i] ^ newvalp[i])) { fullWData(oldp, newvalp, bits); @@ -401,11 +506,20 @@ public: } } } - inline void CHG(Double)(uint32_t* oldp, double newval) { + VL_ATTR_ALWINLINE inline void chgDouble(uint32_t* oldp, double newval) { // cppcheck-suppress invalidPointerCast if (VL_UNLIKELY(*reinterpret_cast(oldp) != newval)) fullDouble(oldp, newval); } -#undef CHG +#ifdef VL_TRACE_OFFLOAD +#undef chgBit +#undef chgCData +#undef chgSData +#undef chgIData +#undef chgQData +#undef chgWData +#undef chgDouble +#endif }; + #endif // guard 
diff --git a/include/verilated_trace_imp.cpp b/include/verilated_trace_imp.h similarity index 71% rename from include/verilated_trace_imp.cpp rename to include/verilated_trace_imp.h index 7a98b7abf..d2ffa965c 100644 --- a/include/verilated_trace_imp.cpp +++ b/include/verilated_trace_imp.h @@ -10,26 +10,26 @@ // SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 // //============================================================================= -/// -/// \file -/// \brief Verilated common-format tracing implementation code -/// -/// This file must be compiled and linked against all Verilated objects -/// that use --trace. -/// -/// Use "verilator --trace" to add this to the Makefile for the linker. -/// +// +// Verilated tracing implementation code template common to all formats. +// This file is included by the format specific implementations and +// should not be used otherwise. +// //============================================================================= // clang-format off #ifndef VL_CPPCHECK -#ifndef VL_DERIVED_T +#if !defined(VL_SUB_T) || !defined(VL_BUF_T) # error "This file should be included in trace format implementations" #endif #include "verilated_intrinsics.h" #include "verilated_trace.h" +#ifdef VL_TRACE_PARALLEL +# include "verilated_threads.h" +# include +#endif #if 0 # include @@ -82,7 +82,7 @@ static std::string doubleToTimescale(double value) { //========================================================================= // Buffer management -template <> uint32_t* VerilatedTrace::getOffloadBuffer() { +template <> uint32_t* VerilatedTrace::getOffloadBuffer() { uint32_t* bufferp; // Some jitter is expected, so some number of alternative offlaod buffers are // required, but don't allocate more than 8 buffers. 
@@ -101,7 +101,7 @@ template <> uint32_t* VerilatedTrace::getOffloadBuffer() { return bufferp; } -template <> void VerilatedTrace::waitForOffloadBuffer(const uint32_t* buffp) { +template <> void VerilatedTrace::waitForOffloadBuffer(const uint32_t* buffp) { // Slow path code only called on flush/shutdown, so use a simple algorithm. // Collect buffers from worker and stash them until we get the one we want. std::deque stash; @@ -116,7 +116,7 @@ template <> void VerilatedTrace::waitForOffloadBuffer(const uint32 //========================================================================= // Worker thread -template <> void VerilatedTrace::offloadWorkerThreadMain() { +template <> void VerilatedTrace::offloadWorkerThreadMain() { bool shutdown = false; do { @@ -127,6 +127,8 @@ template <> void VerilatedTrace::offloadWorkerThreadMain() { const uint32_t* readp = bufferp; + std::unique_ptr traceBufp; // We own the passed tracebuffer + while (true) { const uint32_t cmd = readp[0]; const uint32_t top = cmd >> 4; @@ -141,44 +143,44 @@ template <> void VerilatedTrace::offloadWorkerThreadMain() { // CHG_* commands case VerilatedTraceOffloadCommand::CHG_BIT_0: VL_TRACE_OFFLOAD_DEBUG("Command CHG_BIT_0 " << top); - chgBitImpl(oldp, 0); + traceBufp->chgBitImpl(oldp, 0); continue; case VerilatedTraceOffloadCommand::CHG_BIT_1: VL_TRACE_OFFLOAD_DEBUG("Command CHG_BIT_1 " << top); - chgBitImpl(oldp, 1); + traceBufp->chgBitImpl(oldp, 1); continue; case VerilatedTraceOffloadCommand::CHG_CDATA: VL_TRACE_OFFLOAD_DEBUG("Command CHG_CDATA " << top); // Bits stored in bottom byte of command - chgCDataImpl(oldp, *readp, top); + traceBufp->chgCDataImpl(oldp, *readp, top); readp += 1; continue; case VerilatedTraceOffloadCommand::CHG_SDATA: VL_TRACE_OFFLOAD_DEBUG("Command CHG_SDATA " << top); // Bits stored in bottom byte of command - chgSDataImpl(oldp, *readp, top); + traceBufp->chgSDataImpl(oldp, *readp, top); readp += 1; continue; case VerilatedTraceOffloadCommand::CHG_IDATA: 
VL_TRACE_OFFLOAD_DEBUG("Command CHG_IDATA " << top); // Bits stored in bottom byte of command - chgIDataImpl(oldp, *readp, top); + traceBufp->chgIDataImpl(oldp, *readp, top); readp += 1; continue; case VerilatedTraceOffloadCommand::CHG_QDATA: VL_TRACE_OFFLOAD_DEBUG("Command CHG_QDATA " << top); // Bits stored in bottom byte of command - chgQDataImpl(oldp, *reinterpret_cast(readp), top); + traceBufp->chgQDataImpl(oldp, *reinterpret_cast(readp), top); readp += 2; continue; case VerilatedTraceOffloadCommand::CHG_WDATA: VL_TRACE_OFFLOAD_DEBUG("Command CHG_WDATA " << top); - chgWDataImpl(oldp, readp, top); + traceBufp->chgWDataImpl(oldp, readp, top); readp += VL_WORDS_I(top); continue; case VerilatedTraceOffloadCommand::CHG_DOUBLE: VL_TRACE_OFFLOAD_DEBUG("Command CHG_DOUBLE " << top); - chgDoubleImpl(oldp, *reinterpret_cast(readp)); + traceBufp->chgDoubleImpl(oldp, *reinterpret_cast(readp)); readp += 2; continue; @@ -191,9 +193,18 @@ template <> void VerilatedTrace::offloadWorkerThreadMain() { readp += 2; continue; + case VerilatedTraceOffloadCommand::TRACE_BUFFER: + VL_TRACE_OFFLOAD_DEBUG("Command TRACE_BUFFER " << top); + readp -= 1; // No code in this command, undo increment + traceBufp.reset(*reinterpret_cast(readp)); + readp += 2; + continue; + //=== // Commands ending this buffer - case VerilatedTraceOffloadCommand::END: VL_TRACE_OFFLOAD_DEBUG("Command END"); break; + case VerilatedTraceOffloadCommand::END: // + VL_TRACE_OFFLOAD_DEBUG("Command END"); + break; case VerilatedTraceOffloadCommand::SHUTDOWN: VL_TRACE_OFFLOAD_DEBUG("Command SHUTDOWN"); shutdown = true; @@ -202,8 +213,7 @@ template <> void VerilatedTrace::offloadWorkerThreadMain() { //=== // Unknown command default: { // LCOV_EXCL_START - VL_TRACE_OFFLOAD_DEBUG("Command UNKNOWN"); - VL_PRINTF_MT("Trace command: 0x%08x\n", cmd); + VL_TRACE_OFFLOAD_DEBUG("Command UNKNOWN " << cmd); VL_FATAL_MT(__FILE__, __LINE__, "", "Unknown trace command"); break; } // LCOV_EXCL_STOP @@ -221,7 +231,7 @@ template <> void 
VerilatedTrace::offloadWorkerThreadMain() { } while (VL_LIKELY(!shutdown)); } -template <> void VerilatedTrace::shutdownOffloadWorker() { +template <> void VerilatedTrace::shutdownOffloadWorker() { // If the worker thread is not running, done.. if (!m_workerThread) return; @@ -241,7 +251,7 @@ template <> void VerilatedTrace::shutdownOffloadWorker() { //============================================================================= // Life cycle -template <> void VerilatedTrace::closeBase() { +template <> void VerilatedTrace::closeBase() { #ifdef VL_TRACE_OFFLOAD shutdownOffloadWorker(); while (m_numOffloadBuffers) { @@ -251,7 +261,7 @@ template <> void VerilatedTrace::closeBase() { #endif } -template <> void VerilatedTrace::flushBase() { +template <> void VerilatedTrace::flushBase() { #ifdef VL_TRACE_OFFLOAD // Hand an empty buffer to the worker thread uint32_t* const bufferp = getOffloadBuffer(); @@ -266,46 +276,29 @@ template <> void VerilatedTrace::flushBase() { //============================================================================= // Callbacks to run on global events -template <> void VerilatedTrace::onFlush(void* selfp) { +template <> void VerilatedTrace::onFlush(void* selfp) { // This calls 'flush' on the derived class (which must then get any mutex) - reinterpret_cast(selfp)->flush(); + reinterpret_cast(selfp)->flush(); } -template <> void VerilatedTrace::onExit(void* selfp) { +template <> void VerilatedTrace::onExit(void* selfp) { // This calls 'close' on the derived class (which must then get any mutex) - reinterpret_cast(selfp)->close(); + reinterpret_cast(selfp)->close(); } //============================================================================= // VerilatedTrace -template <> -VerilatedTrace::VerilatedTrace() - : m_sigs_oldvalp{nullptr} - , m_sigs_enabledp{nullptr} - , m_timeLastDump{0} - , m_fullDump{true} - , m_nextCode{0} - , m_numSignals{0} - , m_maxBits{0} - , m_scopeEscape{'.'} - , m_timeRes{1e-9} - , m_timeUnit { - 1e-9 -} -#ifdef 
VL_TRACE_OFFLOAD -, m_numOffloadBuffers { 0 } -#endif -{ +template <> VerilatedTrace::VerilatedTrace() { set_time_unit(Verilated::threadContextp()->timeunitString()); set_time_resolution(Verilated::threadContextp()->timeprecisionString()); } -template <> VerilatedTrace::~VerilatedTrace() { +template <> VerilatedTrace::~VerilatedTrace() { if (m_sigs_oldvalp) VL_DO_CLEAR(delete[] m_sigs_oldvalp, m_sigs_oldvalp = nullptr); if (m_sigs_enabledp) VL_DO_CLEAR(delete[] m_sigs_enabledp, m_sigs_enabledp = nullptr); - Verilated::removeFlushCb(VerilatedTrace::onFlush, this); - Verilated::removeExitCb(VerilatedTrace::onExit, this); + Verilated::removeFlushCb(VerilatedTrace::onFlush, this); + Verilated::removeExitCb(VerilatedTrace::onExit, this); #ifdef VL_TRACE_OFFLOAD closeBase(); #endif @@ -314,7 +307,7 @@ template <> VerilatedTrace::~VerilatedTrace() { //========================================================================= // Internals available to format specific implementations -template <> void VerilatedTrace::traceInit() VL_MT_UNSAFE { +template <> void VerilatedTrace::traceInit() VL_MT_UNSAFE { // Note: It is possible to re-open a trace file (VCD in particular), // so we must reset the next code here, but it must have the same number // of codes on re-open @@ -359,8 +352,8 @@ template <> void VerilatedTrace::traceInit() VL_MT_UNSAFE { } // Set callback so flush/abort will flush this file - Verilated::addFlushCb(VerilatedTrace::onFlush, this); - Verilated::addExitCb(VerilatedTrace::onExit, this); + Verilated::addFlushCb(VerilatedTrace::onFlush, this); + Verilated::addExitCb(VerilatedTrace::onExit, this); #ifdef VL_TRACE_OFFLOAD // Compute offload buffer size. 
we need to be able to store a new value for @@ -372,13 +365,13 @@ template <> void VerilatedTrace::traceInit() VL_MT_UNSAFE { // Start the worker thread m_workerThread.reset( - new std::thread{&VerilatedTrace::offloadWorkerThreadMain, this}); + new std::thread{&VerilatedTrace::offloadWorkerThreadMain, this}); #endif } template <> -bool VerilatedTrace::declCode(uint32_t code, const char* namep, uint32_t bits, - bool tri) { +bool VerilatedTrace::declCode(uint32_t code, const char* namep, uint32_t bits, + bool tri) { if (VL_UNCOVERABLE(!code)) { VL_FATAL_MT(__FILE__, __LINE__, "", "Internal: internal trace problem, code 0 is illegal"); } @@ -422,28 +415,30 @@ bool VerilatedTrace::declCode(uint32_t code, const char* namep, ui //========================================================================= // Internals available to format specific implementations -template <> std::string VerilatedTrace::timeResStr() const { +template <> std::string VerilatedTrace::timeResStr() const { return doubleToTimescale(m_timeRes); } //========================================================================= // External interface to client code -template <> void VerilatedTrace::set_time_unit(const char* unitp) VL_MT_SAFE { +template <> void VerilatedTrace::set_time_unit(const char* unitp) VL_MT_SAFE { m_timeUnit = timescaleToDouble(unitp); } -template <> void VerilatedTrace::set_time_unit(const std::string& unit) VL_MT_SAFE { +template <> +void VerilatedTrace::set_time_unit(const std::string& unit) VL_MT_SAFE { set_time_unit(unit.c_str()); } -template <> void VerilatedTrace::set_time_resolution(const char* unitp) VL_MT_SAFE { +template <> +void VerilatedTrace::set_time_resolution(const char* unitp) VL_MT_SAFE { m_timeRes = timescaleToDouble(unitp); } template <> -void VerilatedTrace::set_time_resolution(const std::string& unit) VL_MT_SAFE { +void VerilatedTrace::set_time_resolution(const std::string& unit) VL_MT_SAFE { set_time_resolution(unit.c_str()); } template <> -void 
VerilatedTrace::dumpvars(int level, const std::string& hier) VL_MT_SAFE { if (level == 0) { m_dumpvars.clear(); // empty = everything on } else { @@ -456,7 +451,87 @@ void VerilatedTrace::dumpvars(int level, const std::string& hier) } } -template <> void VerilatedTrace::dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex) { +#ifdef VL_TRACE_PARALLEL +template <> // +void VerilatedTrace::parallelWorkerTask(void* datap, bool) { + ParallelWorkerData* const wdp = reinterpret_cast(datap); + // Run the task + wdp->m_cb(wdp->m_userp, wdp->m_bufp); + // Mark buffer as ready + const VerilatedLockGuard lock{wdp->m_mutex}; + wdp->m_ready.store(true); + if (wdp->m_waiting) wdp->m_cv.notify_one(); +} + +template <> VL_ATTR_NOINLINE void VerilatedTrace::ParallelWorkerData::wait() { + // Spin for a while, waiting for the buffer to become ready + for (int i = 0; i < VL_LOCK_SPINS; ++i) { + if (VL_LIKELY(m_ready.load(std::memory_order_relaxed))) return; + VL_CPU_RELAX(); + } + // We have been spinning for a while, so yield the thread + VerilatedLockGuard lock{m_mutex}; + m_waiting = true; + m_cv.wait(lock, [this] { return m_ready.load(std::memory_order_relaxed); }); + m_waiting = false; +} +#endif + +template <> +void VerilatedTrace::runParallelCallbacks(const ParallelCallbackMap& cbMap) { + for (VlThreadPool* threadPoolp : m_threadPoolps) { +#ifdef VL_TRACE_PARALLEL + // If tracing in parallel, dispatch to the thread pool (if exists) + if (threadPoolp && threadPoolp->numThreads()) { + // List of work items for thread (std::list, as ParallelWorkerData is not movable) + std::list workerData; + // We use the whole pool + the main thread + const unsigned threads = threadPoolp->numThreads() + 1; + // Main thread executes all jobs with index % threads == 0 + std::vector mainThreadWorkerData; + // The tracing callbacks to execute on this thread-pool + const auto& cbVec = cbMap.at(threadPoolp); + // Enqueue all
the jobs + for (unsigned i = 0; i < cbVec.size(); ++i) { + const CallbackRecord& cbr = cbVec[i]; + // Always get the trace buffer on the main thread + Buffer* const bufp = getTraceBuffer(); + // Create new work item + workerData.emplace_back(cbr.m_dumpCb, cbr.m_userp, bufp); + // Grab the new work item + ParallelWorkerData* const itemp = &workerData.back(); + // Enqueue task to thread pool, or main thread + if (unsigned rem = i % threads) { + threadPoolp->workerp(rem - 1)->addTask(parallelWorkerTask, itemp, false); + } else { + mainThreadWorkerData.push_back(itemp); + } + } + // Execute main thread jobs + for (ParallelWorkerData* const itemp : mainThreadWorkerData) { + parallelWorkerTask(itemp, false); + } + // Commit all trace buffers in order + for (ParallelWorkerData& item : workerData) { + // Wait until ready + item.wait(); + // Commit the buffer + commitTraceBuffer(item.m_bufp); + } + continue; + } +#endif + // Fall back on sequential execution + for (const CallbackRecord& cbr : cbMap.at(threadPoolp)) { + Buffer* const traceBufferp = getTraceBuffer(); + cbr.m_dumpCb(cbr.m_userp, traceBufferp); + commitTraceBuffer(traceBufferp); + } + } +} + +template <> +void VerilatedTrace::dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex) { // Not really VL_MT_SAFE but more VL_MT_UNSAFE_ONE.
// This does get the mutex, but if multiple threads are trying to dump // chances are the data being dumped will have other problems @@ -504,20 +579,14 @@ template <> void VerilatedTrace::dump(uint64_t timeui) VL_MT_SAFE_ // Run the callbacks if (VL_UNLIKELY(m_fullDump)) { m_fullDump = false; // No more need for next dump to be full - for (uint32_t i = 0; i < m_fullCbs.size(); ++i) { - const CallbackRecord& cbr = m_fullCbs[i]; - cbr.m_dumpCb(cbr.m_userp, self()); - } + runParallelCallbacks(m_fullCbs); } else { - for (uint32_t i = 0; i < m_chgCbs.size(); ++i) { - const CallbackRecord& cbr = m_chgCbs[i]; - cbr.m_dumpCb(cbr.m_userp, self()); - } + runParallelCallbacks(m_chgCbs); } for (uint32_t i = 0; i < m_cleanupCbs.size(); ++i) { const CallbackRecord& cbr = m_cleanupCbs[i]; - cbr.m_dumpCb(cbr.m_userp, self()); + cbr.m_cleanupCb(cbr.m_userp, self()); } #ifdef VL_TRACE_OFFLOAD @@ -538,8 +607,18 @@ template <> void VerilatedTrace::dump(uint64_t timeui) VL_MT_SAFE_ // Non-hot path internal interface to Verilator generated code template <> -void VerilatedTrace::addCallbackRecord(std::vector& cbVec, - CallbackRecord& cbRec) +void VerilatedTrace::addThreadPool(VlThreadPool* threadPoolp) + VL_MT_SAFE_EXCLUDES(m_mutex) { + const VerilatedLockGuard lock{m_mutex}; + for (VlThreadPool* const poolp : m_threadPoolps) { + if (poolp == threadPoolp) return; + } + m_threadPoolps.push_back(threadPoolp); +} + +template <> +void VerilatedTrace::addCallbackRecord(std::vector& cbVec, + CallbackRecord& cbRec) VL_MT_SAFE_EXCLUDES(m_mutex) { const VerilatedLockGuard lock{m_mutex}; if (VL_UNCOVERABLE(timeLastDump() != 0)) { // LCOV_EXCL_START @@ -550,91 +629,40 @@ void VerilatedTrace::addCallbackRecord(std::vector cbVec.push_back(cbRec); } -template <> void VerilatedTrace::addInitCb(initCb_t cb, void* userp) VL_MT_SAFE { +template <> +void VerilatedTrace::addInitCb(initCb_t cb, void* userp) VL_MT_SAFE { CallbackRecord cbr{cb, userp}; addCallbackRecord(m_initCbs, cbr); } -template <> void 
VerilatedTrace::addFullCb(dumpCb_t cb, void* userp) VL_MT_SAFE { +template <> +void VerilatedTrace::addFullCb(dumpCb_t cb, void* userp, + VlThreadPool* threadPoolp) VL_MT_SAFE { CallbackRecord cbr{cb, userp}; - addCallbackRecord(m_fullCbs, cbr); + addThreadPool(threadPoolp); + addCallbackRecord(m_fullCbs[threadPoolp], cbr); } -template <> void VerilatedTrace::addChgCb(dumpCb_t cb, void* userp) VL_MT_SAFE { +template <> +void VerilatedTrace::addChgCb(dumpCb_t cb, void* userp, + VlThreadPool* threadPoolp) VL_MT_SAFE { CallbackRecord cbr{cb, userp}; - addCallbackRecord(m_chgCbs, cbr); + addThreadPool(threadPoolp); + addCallbackRecord(m_chgCbs[threadPoolp], cbr); } -template <> void VerilatedTrace::addCleanupCb(dumpCb_t cb, void* userp) VL_MT_SAFE { +template <> +void VerilatedTrace::addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE { CallbackRecord cbr{cb, userp}; addCallbackRecord(m_cleanupCbs, cbr); } -template <> void VerilatedTrace::pushNamePrefix(const std::string& prefix) { +template <> void VerilatedTrace::pushNamePrefix(const std::string& prefix) { m_namePrefixStack.push_back(m_namePrefixStack.back() + prefix); } -template <> void VerilatedTrace::popNamePrefix(unsigned count) { +template <> void VerilatedTrace::popNamePrefix(unsigned count) { while (count--) m_namePrefixStack.pop_back(); assert(!m_namePrefixStack.empty()); } -//========================================================================= -// Hot path internal interface to Verilator generated code - -// These functions must write the new value back into the old value store, -// and subsequently call the format specific emit* implementations. Note -// that this file must be included in the format specific implementation, so -// the emit* functions can be inlined for performance. 
- -template <> void VerilatedTrace::fullBit(uint32_t* oldp, CData newval) { - const uint32_t code = oldp - m_sigs_oldvalp; - *oldp = newval; // Still copy even if not tracing so chg doesn't call full - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - self()->emitBit(code, newval); -} - -template <> void VerilatedTrace::fullCData(uint32_t* oldp, CData newval, int bits) { - const uint32_t code = oldp - m_sigs_oldvalp; - *oldp = newval; // Still copy even if not tracing so chg doesn't call full - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - self()->emitCData(code, newval, bits); -} - -template <> void VerilatedTrace::fullSData(uint32_t* oldp, SData newval, int bits) { - const uint32_t code = oldp - m_sigs_oldvalp; - *oldp = newval; // Still copy even if not tracing so chg doesn't call full - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - self()->emitSData(code, newval, bits); -} - -template <> void VerilatedTrace::fullIData(uint32_t* oldp, IData newval, int bits) { - const uint32_t code = oldp - m_sigs_oldvalp; - *oldp = newval; // Still copy even if not tracing so chg doesn't call full - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - self()->emitIData(code, newval, bits); -} - -template <> void VerilatedTrace::fullQData(uint32_t* oldp, QData newval, int bits) { - const uint32_t code = oldp - m_sigs_oldvalp; - *reinterpret_cast(oldp) = newval; - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - self()->emitQData(code, newval, bits); -} - -template <> -void VerilatedTrace::fullWData(uint32_t* oldp, const WData* newvalp, int bits) { - const uint32_t code = oldp - m_sigs_oldvalp; - for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i]; - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - self()->emitWData(code, newvalp, bits); -} - 
-template <> void VerilatedTrace::fullDouble(uint32_t* oldp, double newval) { - const uint32_t code = oldp - m_sigs_oldvalp; - *reinterpret_cast(oldp) = newval; - if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; - // cppcheck-suppress invalidPointerCast - self()->emitDouble(code, newval); -} - //========================================================================= // Primitives converting binary values to strings... @@ -725,41 +753,86 @@ static inline void cvtQDataToStr(char* dstp, QData value) { #define cvtEDataToStr cvtIDataToStr -//============================================================================= +//========================================================================= +// VerilatedTraceBuffer -#ifdef VERILATED_VCD_TEST - -void verilated_trace_imp_selftest() { -#define SELF_CHECK(got, exp) \ - do { \ - if ((got) != (exp)) VL_FATAL_MT(__FILE__, __LINE__, "", "%Error: selftest"); \ - } while (0) - -#define SELF_CHECK_TS(scale) \ - SELF_CHECK(doubleToTimescale(timescaleToDouble(scale)), std::string{scale}); - SELF_CHECK_TS("100s"); - SELF_CHECK_TS("10s"); - SELF_CHECK_TS("1s"); - SELF_CHECK_TS("100ms"); - SELF_CHECK_TS("10ms"); - SELF_CHECK_TS("1ms"); - SELF_CHECK_TS("100us"); - SELF_CHECK_TS("10us"); - SELF_CHECK_TS("1us"); - SELF_CHECK_TS("100ns"); - SELF_CHECK_TS("10ns"); - SELF_CHECK_TS("1ns"); - SELF_CHECK_TS("100ps"); - SELF_CHECK_TS("10ps"); - SELF_CHECK_TS("1ps"); - SELF_CHECK_TS("100fs"); - SELF_CHECK_TS("10fs"); - SELF_CHECK_TS("1fs"); - SELF_CHECK_TS("100as"); - SELF_CHECK_TS("10as"); - SELF_CHECK_TS("1as"); +template <> // +VerilatedTraceBuffer::VerilatedTraceBuffer(VL_SUB_T& owner) + : m_owner{owner} { +#ifdef VL_TRACE_OFFLOAD + if (m_offloadBufferWritep) { + using This = VerilatedTraceBuffer*; + // Tack on the buffer address + static_assert(2 * sizeof(uint32_t) >= sizeof(This), + "This should be enough on all plafrorms"); + *m_offloadBufferWritep++ = VerilatedTraceOffloadCommand::TRACE_BUFFER; + 
*reinterpret_cast(m_offloadBufferWritep) = this; + m_offloadBufferWritep += 2; + } +#endif } -#endif +// These functions must write the new value back into the old value store, +// and subsequently call the format specific emit* implementations. Note +// that this file must be included in the format specific implementation, so +// the emit* functions can be inlined for performance. + +template <> // +void VerilatedTraceBuffer::fullBit(uint32_t* oldp, CData newval) { + const uint32_t code = oldp - m_sigs_oldvalp; + *oldp = newval; // Still copy even if not tracing so chg doesn't call full + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + self()->emitBit(code, newval); +} + +template <> +void VerilatedTraceBuffer::fullCData(uint32_t* oldp, CData newval, int bits) { + const uint32_t code = oldp - m_sigs_oldvalp; + *oldp = newval; // Still copy even if not tracing so chg doesn't call full + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + self()->emitCData(code, newval, bits); +} + +template <> +void VerilatedTraceBuffer::fullSData(uint32_t* oldp, SData newval, int bits) { + const uint32_t code = oldp - m_sigs_oldvalp; + *oldp = newval; // Still copy even if not tracing so chg doesn't call full + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + self()->emitSData(code, newval, bits); +} + +template <> +void VerilatedTraceBuffer::fullIData(uint32_t* oldp, IData newval, int bits) { + const uint32_t code = oldp - m_sigs_oldvalp; + *oldp = newval; // Still copy even if not tracing so chg doesn't call full + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + self()->emitIData(code, newval, bits); +} + +template <> +void VerilatedTraceBuffer::fullQData(uint32_t* oldp, QData newval, int bits) { + const uint32_t code = oldp - m_sigs_oldvalp; + *reinterpret_cast(oldp) = newval; + if (VL_UNLIKELY(m_sigs_enabledp && 
!(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + self()->emitQData(code, newval, bits); +} + +template <> +void VerilatedTraceBuffer::fullWData(uint32_t* oldp, const WData* newvalp, + int bits) { + const uint32_t code = oldp - m_sigs_oldvalp; + for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i]; + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + self()->emitWData(code, newvalp, bits); +} + +template <> +void VerilatedTraceBuffer::fullDouble(uint32_t* oldp, double newval) { + const uint32_t code = oldp - m_sigs_oldvalp; + *reinterpret_cast(oldp) = newval; + if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return; + // cppcheck-suppress invalidPointerCast + self()->emitDouble(code, newval); +} #endif // VL_CPPCHECK diff --git a/include/verilated_vcd_c.cpp b/include/verilated_vcd_c.cpp index 78383befc..9db71aabc 100644 --- a/include/verilated_vcd_c.cpp +++ b/include/verilated_vcd_c.cpp @@ -62,12 +62,23 @@ constexpr unsigned VL_TRACE_MAX_VCD_CODE_SIZE = 5; // Maximum length of a VCD s // cache-lines. constexpr unsigned VL_TRACE_SUFFIX_ENTRY_SIZE = 8; // Size of a suffix entry +//============================================================================= +// Utility functions: TODO: put these in a common place and share them. 
+ +template static size_t roundUpToMultipleOf(size_t value) { + static_assert((N & (N - 1)) == 0, "'N' must be a power of 2"); + size_t mask = N - 1; + return (value + mask) & ~mask; +} + //============================================================================= // Specialization of the generics for this trace format -#define VL_DERIVED_T VerilatedVcd -#include "verilated_trace_imp.cpp" -#undef VL_DERIVED_T +#define VL_SUB_T VerilatedVcd +#define VL_BUF_T VerilatedVcdBuffer +#include "verilated_trace_imp.h" +#undef VL_SUB_T +#undef VL_BUF_T //============================================================================= //============================================================================= @@ -183,7 +194,7 @@ void VerilatedVcd::makeNameMap() { deleteNameMap(); m_namemapp = new NameMap; - VerilatedTrace::traceInit(); + Super::traceInit(); // Though not speced, it's illegal to generate a vcd with signals // not under any module - it crashes at least two viewers. @@ -218,13 +229,17 @@ VerilatedVcd::~VerilatedVcd() { if (m_wrBufp) VL_DO_CLEAR(delete[] m_wrBufp, m_wrBufp = nullptr); deleteNameMap(); if (m_filep && m_fileNewed) VL_DO_CLEAR(delete m_filep, m_filep = nullptr); +#ifdef VL_TRACE_PARALLEL + assert(m_numBuffers == m_freeBuffers.size()); + for (auto& pair : m_freeBuffers) VL_DO_CLEAR(delete[] pair.first, pair.first = nullptr); +#endif } void VerilatedVcd::closePrev() { // This function is on the flush() call path if (!isOpen()) return; - VerilatedTrace::flushBase(); + Super::flushBase(); bufferFlush(); m_isOpen = false; m_filep->close(); @@ -251,14 +266,14 @@ void VerilatedVcd::close() VL_MT_SAFE_EXCLUDES(m_mutex) { printStr(" $end\n"); } closePrev(); - // closePrev() called VerilatedTrace::flush(), so we just + // closePrev() called Super::flush(), so we just // need to shut down the tracing thread here. 
- VerilatedTrace::closeBase(); + Super::closeBase(); } void VerilatedVcd::flush() VL_MT_SAFE_EXCLUDES(m_mutex) { const VerilatedLockGuard lock{m_mutex}; - VerilatedTrace::flushBase(); + Super::flushBase(); bufferFlush(); } @@ -277,12 +292,12 @@ void VerilatedVcd::printQuad(uint64_t n) { printStr(buf); } -void VerilatedVcd::bufferResize(uint64_t minsize) { +void VerilatedVcd::bufferResize(size_t minsize) { // minsize is size of largest write. We buffer at least 8 times as much data, // writing when we are 3/4 full (with thus 2*minsize remaining free) if (VL_UNLIKELY(minsize > m_wrChunkSize)) { const char* oldbufp = m_wrBufp; - m_wrChunkSize = minsize * 2; + m_wrChunkSize = roundUpToMultipleOf<1024>(minsize * 2); m_wrBufp = new char[m_wrChunkSize * 8]; std::memcpy(m_wrBufp, oldbufp, m_writep - oldbufp); m_writep = m_wrBufp + (m_writep - oldbufp); @@ -463,14 +478,16 @@ void VerilatedVcd::declare(uint32_t code, const char* name, const char* wirep, b int arraynum, bool tri, bool bussed, int msb, int lsb) { const int bits = ((msb > lsb) ? 
(msb - lsb) : (lsb - msb)) + 1; - const bool enabled = VerilatedTrace::declCode(code, name, bits, tri); + const bool enabled = Super::declCode(code, name, bits, tri); if (m_suffixes.size() <= nextCode() * VL_TRACE_SUFFIX_ENTRY_SIZE) { m_suffixes.resize(nextCode() * VL_TRACE_SUFFIX_ENTRY_SIZE * 2, 0); } - // Make sure write buffer is large enough (one character per bit), plus header - bufferResize(bits + 1024); + // Keep upper bound on bytes a single signal can emit into the buffer + m_maxSignalBytes = std::max(m_maxSignalBytes, bits + 32); + // Make sure write buffer is large enough, plus header + bufferResize(m_maxSignalBytes + 1024); if (!enabled) return; @@ -562,26 +579,73 @@ void VerilatedVcd::declArray(uint32_t code, const char* name, bool array, int ar void VerilatedVcd::declDouble(uint32_t code, const char* name, bool array, int arraynum) { declare(code, name, "real", array, arraynum, false, false, 63, 0); } -#ifdef VL_TRACE_VCD_OLD_API -void VerilatedVcd::declTriBit(uint32_t code, const char* name, bool array, int arraynum) { - declare(code, name, "wire", array, arraynum, true, false, 0, 0); -} -void VerilatedVcd::declTriBus(uint32_t code, const char* name, bool array, int arraynum, int msb, - int lsb) { - declare(code, name, "wire", array, arraynum, true, true, msb, lsb); -} -void VerilatedVcd::declTriQuad(uint32_t code, const char* name, bool array, int arraynum, int msb, - int lsb) { - declare(code, name, "wire", array, arraynum, true, true, msb, lsb); -} -void VerilatedVcd::declTriArray(uint32_t code, const char* name, bool array, int arraynum, int msb, - int lsb) { - declare(code, name, "wire", array, arraynum, true, true, msb, lsb); -} -#endif // VL_TRACE_VCD_OLD_API //============================================================================= -// Trace rendering prinitives +// Get/commit trace buffer + +VerilatedVcdBuffer* VerilatedVcd::getTraceBuffer() { +#ifdef VL_TRACE_PARALLEL + // Note: This is called from VerilatedVcd::dump, which already
holds the lock + // If no buffer available, allocate a new one + if (m_freeBuffers.empty()) { + constexpr size_t pageSize = 4096; + // 4 * m_maxSignalBytes, so we can reserve 2 * m_maxSignalBytes at the end for safety + size_t startingSize = roundUpToMultipleOf(4 * m_maxSignalBytes); + m_freeBuffers.emplace_back(new char[startingSize], startingSize); + ++m_numBuffers; + } + // Grab a buffer + const auto pair = m_freeBuffers.back(); + m_freeBuffers.pop_back(); + // Return the buffer + return new VerilatedVcdBuffer{*this, pair.first, pair.second}; +#else + return new VerilatedVcdBuffer{*this}; +#endif +} + +void VerilatedVcd::commitTraceBuffer(VerilatedVcdBuffer* bufp) { +#ifdef VL_TRACE_PARALLEL + // Note: This is called from VerilatedVcd::dump, which already holds the lock + // Resize output buffer. Note, we use the full size of the trace buffer, as + // this is a lot more stable than the actual occupancy of the trace buffer. + // This helps us to avoid re-allocations due to small size changes.
+ bufferResize(bufp->m_size); + // Compute occupancy of buffer + const size_t usedSize = bufp->m_writep - bufp->m_bufp; + // Copy to output buffer + std::memcpy(m_writep, bufp->m_bufp, usedSize); + // Adjust write pointer + m_writep += usedSize; + // Flush if necessary + bufferCheck(); + // Put buffer back on free list + m_freeBuffers.emplace_back(bufp->m_bufp, bufp->m_size); +#else + // Needs adjusting for emitTimeChange + m_writep = bufp->m_writep; +#endif + delete bufp; +} + +//============================================================================= +// VerilatedVcdBuffer implementation + +#ifdef VL_TRACE_PARALLEL +VerilatedVcdBuffer::VerilatedVcdBuffer(VerilatedVcd& owner, char* bufp, size_t size) + : VerilatedTraceBuffer{owner} + , m_writep{bufp} + , m_bufp{bufp} + , m_size{size} { + adjustGrowp(); +} +#else +VerilatedVcdBuffer::VerilatedVcdBuffer(VerilatedVcd& owner) + : VerilatedTraceBuffer{owner} {} +#endif + +//============================================================================= +// Trace rendering primitives static inline void VerilatedVcdCCopyAndAppendNewLine(char* writep, const char* suffixp) VL_ATTR_NO_SANITIZE_ALIGN; @@ -606,26 +670,55 @@ static inline void VerilatedVcdCCopyAndAppendNewLine(char* writep, const char* s #endif } -void VerilatedVcd::finishLine(uint32_t code, char* writep) { - const char* const suffixp = m_suffixes.data() + code * VL_TRACE_SUFFIX_ENTRY_SIZE; +void VerilatedVcdBuffer::finishLine(uint32_t code, char* writep) { + const char* const suffixp = m_suffixes + code * VL_TRACE_SUFFIX_ENTRY_SIZE; VL_DEBUG_IFDEF(assert(suffixp[0]);); VerilatedVcdCCopyAndAppendNewLine(writep, suffixp); // Now write back the write pointer incremented by the actual size of the // suffix, which was stored in the last byte of the suffix buffer entry. 
m_writep = writep + suffixp[VL_TRACE_SUFFIX_ENTRY_SIZE - 1]; - bufferCheck(); + +#ifdef VL_TRACE_PARALLEL + // Double the size of the buffer if necessary + if (VL_UNLIKELY(m_writep >= m_growp)) { + // Compute occupied size of current buffer + const size_t usedSize = m_writep - m_bufp; + // We are always doubling the size + m_size *= 2; + // Allocate the new buffer + char* const newBufp = new char[m_size]; + // Copy from current buffer to new buffer + std::memcpy(newBufp, m_bufp, usedSize); + // Delete current buffer + delete[] m_bufp; + // Make new buffer the current buffer + m_bufp = newBufp; + // Adjust write pointer + m_writep = m_bufp + usedSize; + // Adjust resize limit + adjustGrowp(); + } +#else + // Flush the write buffer if there's not enough space left for new information + // We only call this once per vector, so we need enough slop for a very wide "b###" line + if (VL_UNLIKELY(m_writep > m_wrFlushp)) { + m_owner.m_writep = m_writep; + m_owner.bufferFlush(); + m_writep = m_owner.m_writep; + } +#endif } //============================================================================= // emit* trace routines // Note: emit* are only ever called from one place (full* in -// verilated_trace_imp.cpp, which is included in this file at the top), +// verilated_trace_imp.h, which is included in this file at the top), // so always inline them. 
VL_ATTR_ALWINLINE -void VerilatedVcd::emitBit(uint32_t code, CData newval) { +void VerilatedVcdBuffer::emitBit(uint32_t code, CData newval) { // Don't prefetch suffix as it's a bit too late; char* wp = m_writep; *wp++ = '0' | static_cast(newval); @@ -633,7 +726,7 @@ void VerilatedVcd::emitBit(uint32_t code, CData newval) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitCData(uint32_t code, CData newval, int bits) { +void VerilatedVcdBuffer::emitCData(uint32_t code, CData newval, int bits) { char* wp = m_writep; *wp++ = 'b'; cvtCDataToStr(wp, newval << (VL_BYTESIZE - bits)); @@ -641,7 +734,7 @@ void VerilatedVcd::emitCData(uint32_t code, CData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitSData(uint32_t code, SData newval, int bits) { +void VerilatedVcdBuffer::emitSData(uint32_t code, SData newval, int bits) { char* wp = m_writep; *wp++ = 'b'; cvtSDataToStr(wp, newval << (VL_SHORTSIZE - bits)); @@ -649,7 +742,7 @@ void VerilatedVcd::emitSData(uint32_t code, SData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitIData(uint32_t code, IData newval, int bits) { +void VerilatedVcdBuffer::emitIData(uint32_t code, IData newval, int bits) { char* wp = m_writep; *wp++ = 'b'; cvtIDataToStr(wp, newval << (VL_IDATASIZE - bits)); @@ -657,7 +750,7 @@ void VerilatedVcd::emitIData(uint32_t code, IData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitQData(uint32_t code, QData newval, int bits) { +void VerilatedVcdBuffer::emitQData(uint32_t code, QData newval, int bits) { char* wp = m_writep; *wp++ = 'b'; cvtQDataToStr(wp, newval << (VL_QUADSIZE - bits)); @@ -665,7 +758,7 @@ void VerilatedVcd::emitQData(uint32_t code, QData newval, int bits) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitWData(uint32_t code, const WData* newvalp, int bits) { +void VerilatedVcdBuffer::emitWData(uint32_t code, const WData* newvalp, int bits) { int words = VL_WORDS_I(bits); char* wp = m_writep; *wp++ = 'b'; @@ -682,272 +775,10 @@ void 
VerilatedVcd::emitWData(uint32_t code, const WData* newvalp, int bits) { } VL_ATTR_ALWINLINE -void VerilatedVcd::emitDouble(uint32_t code, double newval) { +void VerilatedVcdBuffer::emitDouble(uint32_t code, double newval) { char* wp = m_writep; // Buffer can't overflow before VL_SNPRINTF; we sized during declaration - VL_SNPRINTF(wp, m_wrChunkSize, "r%.16g", newval); + VL_SNPRINTF(wp, m_maxSignalBytes, "r%.16g", newval); wp += std::strlen(wp); finishLine(code, wp); } - -#ifdef VL_TRACE_VCD_OLD_API - -void VerilatedVcd::fullBit(uint32_t code, const uint32_t newval) { - // Note the &1, so we don't require clean input -- makes more common no change case faster - *oldp(code) = newval; - *m_writep++ = ('0' + static_cast(newval & 1)); - m_writep = writeCode(m_writep, code); - *m_writep++ = '\n'; - bufferCheck(); -} -void VerilatedVcd::fullBus(uint32_t code, const uint32_t newval, int bits) { - *oldp(code) = newval; - *m_writep++ = 'b'; - for (int bit = bits - 1; bit >= 0; --bit) { - *m_writep++ = ((newval & (1L << bit)) ? '1' : '0'); - } - *m_writep++ = ' '; - m_writep = writeCode(m_writep, code); - *m_writep++ = '\n'; - bufferCheck(); -} -void VerilatedVcd::fullQuad(uint32_t code, const uint64_t newval, int bits) { - (*(reinterpret_cast(oldp(code)))) = newval; - *m_writep++ = 'b'; - for (int bit = bits - 1; bit >= 0; --bit) { - *m_writep++ = ((newval & (1ULL << bit)) ? '1' : '0'); - } - *m_writep++ = ' '; - m_writep = writeCode(m_writep, code); - *m_writep++ = '\n'; - bufferCheck(); -} -void VerilatedVcd::fullArray(uint32_t code, const uint32_t* newval, int bits) { - for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) { oldp(code)[word] = newval[word]; } - *m_writep++ = 'b'; - for (int bit = bits - 1; bit >= 0; --bit) { - *m_writep++ = ((newval[(bit / 32)] & (1L << (bit & 0x1f))) ? 
'1' : '0'); - } - *m_writep++ = ' '; - m_writep = writeCode(m_writep, code); - *m_writep++ = '\n'; - bufferCheck(); -} -void VerilatedVcd::fullArray(uint32_t code, const uint64_t* newval, int bits) { - for (int word = 0; word < (((bits - 1) / 64) + 1); ++word) { oldp(code)[word] = newval[word]; } - *m_writep++ = 'b'; - for (int bit = bits - 1; bit >= 0; --bit) { - *m_writep++ = ((newval[(bit / 64)] & (1ULL << (bit & 0x3f))) ? '1' : '0'); - } - *m_writep++ = ' '; - m_writep = writeCode(m_writep, code); - *m_writep++ = '\n'; - bufferCheck(); -} -void VerilatedVcd::fullTriBit(uint32_t code, const uint32_t newval, const uint32_t newtri) { - oldp(code)[0] = newval; - oldp(code)[1] = newtri; - *m_writep++ = "01zz"[newval | (newtri << 1)]; - m_writep = writeCode(m_writep, code); - *m_writep++ = '\n'; - bufferCheck(); -} -void VerilatedVcd::fullTriBus(uint32_t code, const uint32_t newval, const uint32_t newtri, - int bits) { - oldp(code)[0] = newval; - oldp(code)[1] = newtri; - *m_writep++ = 'b'; - for (int bit = bits - 1; bit >= 0; --bit) { - *m_writep++ = "01zz"[((newval >> bit) & 1) | (((newtri >> bit) & 1) << 1)]; - } - *m_writep++ = ' '; - m_writep = writeCode(m_writep, code); - *m_writep++ = '\n'; - bufferCheck(); -} -void VerilatedVcd::fullTriQuad(uint32_t code, const uint64_t newval, const uint64_t newtri, - int bits) { - (*(reinterpret_cast(oldp(code)))) = newval; - (*(reinterpret_cast(oldp(code + 1)))) = newtri; - *m_writep++ = 'b'; - for (int bit = bits - 1; bit >= 0; --bit) { - *m_writep++ = "01zz"[((newval >> bit) & 1ULL) | (((newtri >> bit) & 1ULL) << 1ULL)]; - } - *m_writep++ = ' '; - m_writep = writeCode(m_writep, code); - *m_writep++ = '\n'; - bufferCheck(); -} -void VerilatedVcd::fullTriArray(uint32_t code, const uint32_t* newvalp, const uint32_t* newtrip, - int bits) { - for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) { - oldp(code)[word * 2] = newvalp[word]; - oldp(code)[word * 2 + 1] = newtrip[word]; - } - *m_writep++ = 'b'; - for (int bit = 
bits - 1; bit >= 0; --bit) { - uint32_t valbit = (newvalp[(bit / 32)] >> (bit & 0x1f)) & 1; - uint32_t tribit = (newtrip[(bit / 32)] >> (bit & 0x1f)) & 1; - *m_writep++ = "01zz"[valbit | (tribit << 1)]; - } - *m_writep++ = ' '; - m_writep = writeCode(m_writep, code); - *m_writep++ = '\n'; - bufferCheck(); -} -void VerilatedVcd::fullDouble(uint32_t code, const double newval) { - // cppcheck-suppress invalidPointerCast - (*(reinterpret_cast(oldp(code)))) = newval; - // Buffer can't overflow before VL_SNPRINTF; we sized during declaration - VL_SNPRINTF(m_writep, m_wrChunkSize, "r%.16g", newval); - m_writep += std::strlen(m_writep); - *m_writep++ = ' '; - m_writep = writeCode(m_writep, code); - *m_writep++ = '\n'; - bufferCheck(); -} - -#endif // VL_TRACE_VCD_OLD_API - -//====================================================================== -//====================================================================== -//====================================================================== - -#ifdef VERILATED_VCD_TEST -#include - -extern void verilated_trace_imp_selftest(); - -uint32_t v1, v2, s1, s2[3]; -uint32_t tri96[3]; -uint32_t tri96__tri[3]; -uint64_t quad96[2]; -uint64_t tquad; -uint64_t tquad__tri; -uint8_t ch; -uint64_t timestamp = 1; -double doub = 0.0; -float flo = 0.0f; - -void vcdInit(void*, VerilatedVcd* vcdp, uint32_t) { - vcdp->scopeEscape('.'); - vcdp->pushNamePrefix("top."); - /**/ vcdp->declBus(0x2, "v1", -1, 0, 5, 1); - /**/ vcdp->declBus(0x3, "v2", -1, 0, 6, 1); - /**/ vcdp->pushNamePrefix("sub1."); - /***/ vcdp->declBit(0x4, "s1", -1, 0); - /***/ vcdp->declBit(0x5, "ch", -1, 0); - /**/ vcdp->popNamePrefix(); - /**/ vcdp->pushNamePrefix("sub2."); - /***/ vcdp->declArray(0x6, "s2", -1, 0, 40, 3); - /**/ vcdp->popNamePrefix(); - vcdp->popNamePrefix(); - // Note need to add 3 for next code. 
- vcdp->pushNamePrefix("top2."); - /**/ vcdp->declBus(0x2, "t2v1", -1, 0, 4, 1); - /**/ vcdp->declTriBit(0x10, "io1", -1, 0); - /**/ vcdp->declTriBus(0x12, "io5", -1, 0, 4, 0); - /**/ vcdp->declTriArray(0x16, "io96", -1, 0, 95, 0); - /**/ // Note need to add 6 for next code. - /**/ vcdp->declDouble(0x1c, "doub", -1, 0); - /**/ // Note need to add 2 for next code. - /**/ vcdp->declArray(0x20, "q2", -1, 0, 95, 0); - /**/ // Note need to add 4 for next code. - /**/ vcdp->declTriQuad(0x24, "tq", -1, 0, 63, 0); - /**/ // Note need to add 4 for next code. - vcdp->popNamePrefix(); -} - -void vcdFull(void*, VerilatedVcd* vcdp) { - vcdp->fullBus(0x2, v1, 5); - vcdp->fullBus(0x3, v2, 7); - vcdp->fullBit(0x4, s1); - vcdp->fullBus(0x5, ch, 2); - vcdp->fullArray(0x6, &s2[0], 38); - vcdp->fullTriBit(0x10, tri96[0] & 1, tri96__tri[0] & 1); - vcdp->fullTriBus(0x12, tri96[0] & 0x1f, tri96__tri[0] & 0x1f, 5); - vcdp->fullTriArray(0x16, tri96, tri96__tri, 96); - vcdp->fullDouble(0x1c, doub); - vcdp->fullArray(0x20, &quad96[0], 96); - vcdp->fullTriQuad(0x24, tquad, tquad__tri, 64); -} - -void vcdChange(void*, VerilatedVcd* vcdp) { - vcdp->chgBus(0x2, v1, 5); - vcdp->chgBus(0x3, v2, 7); - vcdp->chgBit(0x4, s1); - vcdp->chgBus(0x5, ch, 2); - vcdp->chgArray(0x6, &s2[0], 38); - vcdp->chgTriBit(0x10, tri96[0] & 1, tri96__tri[0] & 1); - vcdp->chgTriBus(0x12, tri96[0] & 0x1f, tri96__tri[0] & 0x1f, 5); - vcdp->chgTriArray(0x16, tri96, tri96__tri, 96); - vcdp->chgDouble(0x1c, doub); - vcdp->chgArray(0x20, &quad96[0], 96); - vcdp->chgTriQuad(0x24, tquad, tquad__tri, 64); -} - -// clang-format off -void vcdTestMain(const char* filenamep) { - verilated_trace_imp_selftest(); - - v1 = v2 = s1 = 0; - s2[0] = s2[1] = s2[2] = 0; - tri96[2] = tri96[1] = tri96[0] = 0; - tri96__tri[2] = tri96__tri[1] = tri96__tri[0] = ~0; - quad96[1] = quad96[0] = 0; - ch = 0; - doub = 0; - tquad = tquad__tri = 0; - { - VerilatedVcdC* vcdp = new VerilatedVcdC; - vcdp->evcd(true); - vcdp->set_time_unit("1ms"); - 
vcdp->set_time_unit(std::string{"1ms"}); - vcdp->set_time_resolution("1ns"); - vcdp->set_time_resolution(std::string{"1ns"}); - vcdp->spTrace()->addInitCb(&vcdInit, 0); - vcdp->spTrace()->addFullCb(&vcdFull, 0); - vcdp->spTrace()->addChgCb(&vcdChange, 0); - vcdp->open(filenamep); - // Dumping - vcdp->dump(++timestamp); - v1 = 0xfff; - tri96[2] = 4; tri96[1] = 2; tri96[0] = 1; - tri96__tri[2] = tri96__tri[1] = tri96__tri[0] = ~0; // Still tri - quad96[1] = 0xffffffff; quad96[0] = 0; - doub = 1.5; - flo = 1.4f; - vcdp->dump(++timestamp); - v2 = 0x1; - s2[1] = 2; - tri96__tri[2] = tri96__tri[1] = tri96__tri[0] = 0; // enable w/o data change - quad96[1] = 0; quad96[0] = ~0; - doub = -1.66e13; - flo = 0.123f; - tquad = 0x00ff00ff00ff00ffULL; - tquad__tri = 0x0000fffff0000ffffULL; - vcdp->dump(++timestamp); - ch = 2; - tri96[2] = ~4; tri96[1] = ~2; tri96[0] = ~1; - doub = -3.33e-13; - vcdp->dump(++timestamp); - vcdp->dump(++timestamp); -# ifdef VERILATED_VCD_TEST_64BIT - const uint64_t bytesPerDump = 15ULL; - for (uint64_t i = 0; i < ((1ULL << 32) / bytesPerDump); i++) { - v1 = i; - vcdp->dump(++timestamp); - } -# endif - vcdp->close(); - VL_DO_CLEAR(delete vcdp, vcdp = nullptr); - } -} -#endif -// clang-format on - -//******************************************************************** -// ;compile-command: "v4make test_regress/t/t_trace_c_api.pl" -// -// Local Variables: -// End: diff --git a/include/verilated_vcd_c.h b/include/verilated_vcd_c.h index 5fbb6022c..0d83eb25d 100644 --- a/include/verilated_vcd_c.h +++ b/include/verilated_vcd_c.h @@ -28,39 +28,20 @@ #include #include -class VerilatedVcd; - -//============================================================================= -// VerilatedFile -/// Class representing a file to write to. These virtual methods can be -/// overrode for e.g. socket I/O. 
- -class VerilatedVcdFile VL_NOT_FINAL { -private: - int m_fd = 0; // File descriptor we're writing to -public: - // METHODS - /// Construct a (as yet) closed file - VerilatedVcdFile() = default; - /// Close and destruct - virtual ~VerilatedVcdFile() = default; - /// Open a file with given filename - virtual bool open(const std::string& name) VL_MT_UNSAFE; - /// Close object's file - virtual void close() VL_MT_UNSAFE; - /// Write data to file (if it is open) - virtual ssize_t write(const char* bufp, ssize_t len) VL_MT_UNSAFE; -}; +class VerilatedVcdBuffer; +class VerilatedVcdFile; //============================================================================= // VerilatedVcd // Base class to create a Verilator VCD dump // This is an internally used class - see VerilatedVcdC for what to call from applications -class VerilatedVcd VL_NOT_FINAL : public VerilatedTrace { +class VerilatedVcd VL_NOT_FINAL : public VerilatedTrace { +public: + using Super = VerilatedTrace; + private: - // Give the superclass access to private bits (to avoid virtual functions) - friend class VerilatedTrace; + friend Buffer; // Give the buffer access to the private bits //========================================================================= // VCD specific internals @@ -74,9 +55,10 @@ private: int m_modDepth = 0; // Depth of module hierarchy char* m_wrBufp; // Output buffer - const char* m_wrFlushp; // Output buffer flush trigger location + char* m_wrFlushp; // Output buffer flush trigger location char* m_writep; // Write pointer into output buffer - uint64_t m_wrChunkSize; // Output buffer size + size_t m_wrChunkSize; // Output buffer size + size_t m_maxSignalBytes = 0; // Upper bound on number of bytes a single signal can generate uint64_t m_wroteBytes = 0; // Number of bytes written to this file std::vector m_suffixes; // VCD line end string codes + metadata @@ -84,7 +66,13 @@ private: using NameMap = std::map; NameMap* m_namemapp = nullptr; // List of names for the header - void 
bufferResize(uint64_t minsize); +#ifdef VL_TRACE_PARALLEL + // Vector of free trace buffers as (pointer, size) pairs. + std::vector> m_freeBuffers; + size_t m_numBuffers = 0; // Number of trace buffers allocated +#endif + + void bufferResize(size_t minsize); void bufferFlush() VL_MT_UNSAFE_ONE; inline void bufferCheck() { // Flush the write buffer if there's not enough space left for new information @@ -107,8 +95,6 @@ private: static char* writeCode(char* writep, uint32_t code); - void finishLine(uint32_t code, char* writep); - // CONSTRUCTORS VL_UNCOPYABLE(VerilatedVcd); @@ -116,27 +102,22 @@ protected: //========================================================================= // Implementation of VerilatedTrace interface - // Implementations of protected virtual methods for VerilatedTrace + // Called when the trace moves forward to a new time point virtual void emitTimeChange(uint64_t timeui) override; // Hooks called from VerilatedTrace virtual bool preFullDump() override { return isOpen(); } virtual bool preChangeDump() override; - // Implementations of duck-typed methods for VerilatedTrace. These are - // called from only one place (namely full*) so always inline them. 
- inline void emitBit(uint32_t code, CData newval); - inline void emitCData(uint32_t code, CData newval, int bits); - inline void emitSData(uint32_t code, SData newval, int bits); - inline void emitIData(uint32_t code, IData newval, int bits); - inline void emitQData(uint32_t code, QData newval, int bits); - inline void emitWData(uint32_t code, const WData* newvalp, int bits); - inline void emitDouble(uint32_t code, double newval); + // Trace buffer management + virtual VerilatedVcdBuffer* getTraceBuffer() override; + virtual void commitTraceBuffer(VerilatedVcdBuffer*) override; public: //========================================================================= // External interface to client code + // CONSTRUCTOR explicit VerilatedVcd(VerilatedVcdFile* filep = nullptr); ~VerilatedVcd(); @@ -144,7 +125,7 @@ public: // Set size in megabytes after which new file should be created void rolloverMB(uint64_t rolloverMB) { m_rolloverMB = rolloverMB; } - // METHODS + // METHODS - All must be thread safe // Open the file; call isOpen() to see if errors void open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex); // Open next data-only file @@ -164,168 +145,95 @@ public: void declQuad(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb); void declArray(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb); void declDouble(uint32_t code, const char* name, bool array, int arraynum); - -#ifdef VL_TRACE_VCD_OLD_API - //========================================================================= - // Note: These are only for testing for backward compatibility with foreign - // code and is not used by Verilator. Do not use these as there is no - // guarantee of functionality. 
- - void declTriBit(uint32_t code, const char* name, bool array, int arraynum); - void declTriBus(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb); - void declTriQuad(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb); - void declTriArray(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb); - - void fullBit(uint32_t* oldp, CData newval) { fullBit(oldp - this->oldp(0), newval); } - void fullCData(uint32_t* oldp, CData newval, int bits) { - fullBus(oldp - this->oldp(0), newval, bits); - } - void fullSData(uint32_t* oldp, SData newval, int bits) { - fullBus(oldp - this->oldp(0), newval, bits); - } - void fullIData(uint32_t* oldp, IData newval, int bits) { - fullBus(oldp - this->oldp(0), newval, bits); - } - void fullQData(uint32_t* oldp, QData newval, int bits) { - fullQuad(oldp - this->oldp(0), newval, bits); - } - void fullWData(uint32_t* oldp, const WData* newvalp, int bits) { - fullArray(oldp - this->oldp(0), newvalp, bits); - } - void fullDouble(uint32_t* oldp, double newval) { fullDouble(oldp - this->oldp(0), newval); } - - inline void chgBit(uint32_t* oldp, CData newval) { chgBit(oldp - this->oldp(0), newval); } - inline void chgCData(uint32_t* oldp, CData newval, int bits) { - chgBus(oldp - this->oldp(0), newval, bits); - } - inline void chgSData(uint32_t* oldp, SData newval, int bits) { - chgBus(oldp - this->oldp(0), newval, bits); - } - inline void chgIData(uint32_t* oldp, IData newval, int bits) { - chgBus(oldp - this->oldp(0), newval, bits); - } - inline void chgQData(uint32_t* oldp, QData newval, int bits) { - chgQuad(oldp - this->oldp(0), newval, bits); - } - inline void chgWData(uint32_t* oldp, const WData* newvalp, int bits) { - chgArray(oldp - this->oldp(0), newvalp, bits); - } - inline void chgDouble(uint32_t* oldp, double newval) { - chgDouble(oldp - this->oldp(0), newval); - } - - // Inside dumping routines, dump one signal, faster when not inlined - // due to code size 
reduction. - void fullBit(uint32_t code, const uint32_t newval); - void fullBus(uint32_t code, const uint32_t newval, int bits); - void fullQuad(uint32_t code, const uint64_t newval, int bits); - void fullArray(uint32_t code, const uint32_t* newvalp, int bits); - void fullArray(uint32_t code, const uint64_t* newvalp, int bits); - void fullTriBit(uint32_t code, const uint32_t newval, const uint32_t newtri); - void fullTriBus(uint32_t code, const uint32_t newval, const uint32_t newtri, int bits); - void fullTriQuad(uint32_t code, const uint64_t newval, const uint64_t newtri, int bits); - void fullTriArray(uint32_t code, const uint32_t* newvalp, const uint32_t* newtrip, int bits); - void fullDouble(uint32_t code, const double newval); - - // Inside dumping routines, dump one signal if it has changed. - // We do want to inline these to avoid calls when the value did not change. - inline void chgBit(uint32_t code, const uint32_t newval) { - const uint32_t diff = oldp(code)[0] ^ newval; - if (VL_UNLIKELY(diff)) fullBit(code, newval); - } - inline void chgBus(uint32_t code, const uint32_t newval, int bits) { - const uint32_t diff = oldp(code)[0] ^ newval; - if (VL_UNLIKELY(diff)) { - if (VL_UNLIKELY(bits == 32 || (diff & ((1U << bits) - 1)))) { - fullBus(code, newval, bits); - } - } - } - inline void chgQuad(uint32_t code, const uint64_t newval, int bits) { - const uint64_t diff = (*(reinterpret_cast(oldp(code)))) ^ newval; - if (VL_UNLIKELY(diff)) { - if (VL_UNLIKELY(bits == 64 || (diff & ((1ULL << bits) - 1)))) { - fullQuad(code, newval, bits); - } - } - } - inline void chgArray(uint32_t code, const uint32_t* newvalp, int bits) { - for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) { - if (VL_UNLIKELY(oldp(code)[word] ^ newvalp[word])) { - fullArray(code, newvalp, bits); - return; - } - } - } - inline void chgArray(uint32_t code, const uint64_t* newvalp, int bits) { - for (int word = 0; word < (((bits - 1) / 64) + 1); ++word) { - if 
(VL_UNLIKELY(*(reinterpret_cast(oldp(code + 2 * word))) - ^ newvalp[word])) { - fullArray(code, newvalp, bits); - return; - } - } - } - inline void chgTriBit(uint32_t code, const uint32_t newval, const uint32_t newtri) { - const uint32_t diff = ((oldp(code)[0] ^ newval) | (oldp(code)[1] ^ newtri)); - if (VL_UNLIKELY(diff)) { - // Verilator 3.510 and newer provide clean input, so the below - // is only for back compatibility - if (VL_UNLIKELY(diff & 1)) { // Change after clean? - fullTriBit(code, newval, newtri); - } - } - } - inline void chgTriBus(uint32_t code, const uint32_t newval, const uint32_t newtri, int bits) { - const uint32_t diff = ((oldp(code)[0] ^ newval) | (oldp(code)[1] ^ newtri)); - if (VL_UNLIKELY(diff)) { - if (VL_UNLIKELY(bits == 32 || (diff & ((1U << bits) - 1)))) { - fullTriBus(code, newval, newtri, bits); - } - } - } - inline void chgTriQuad(uint32_t code, const uint64_t newval, const uint64_t newtri, int bits) { - const uint64_t diff = (((*(reinterpret_cast(oldp(code)))) ^ newval) - | ((*(reinterpret_cast(oldp(code + 1)))) ^ newtri)); - if (VL_UNLIKELY(diff)) { - if (VL_UNLIKELY(bits == 64 || (diff & ((1ULL << bits) - 1)))) { - fullTriQuad(code, newval, newtri, bits); - } - } - } - inline void chgTriArray(uint32_t code, const uint32_t* newvalp, const uint32_t* newtrip, - int bits) { - for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) { - if (VL_UNLIKELY((oldp(code)[word * 2] ^ newvalp[word]) - | (oldp(code)[word * 2 + 1] ^ newtrip[word]))) { - fullTriArray(code, newvalp, newtrip, bits); - return; - } - } - } - inline void chgDouble(uint32_t code, const double newval) { - // cppcheck-suppress invalidPointerCast - if (VL_UNLIKELY((*(reinterpret_cast(oldp(code)))) != newval)) { - fullDouble(code, newval); - } - } - - // METHODS - // Old/standalone API only - void evcd(bool flag) { m_evcd = flag; } -#endif // VL_TRACE_VCD_OLD_API }; #ifndef DOXYGEN -// Declare specializations here they are used in VerilatedVcdC just below -template <> 
void VerilatedTrace::dump(uint64_t timeui); -template <> void VerilatedTrace::set_time_unit(const char* unitp); -template <> void VerilatedTrace::set_time_unit(const std::string& unit); -template <> void VerilatedTrace::set_time_resolution(const char* unitp); -template <> void VerilatedTrace::set_time_resolution(const std::string& unit); -template <> void VerilatedTrace::dumpvars(int level, const std::string& hier); +// Declare specialization here as it's used in VerilatedFstC just below +template <> void VerilatedVcd::Super::dump(uint64_t time); +template <> void VerilatedVcd::Super::set_time_unit(const char* unitp); +template <> void VerilatedVcd::Super::set_time_unit(const std::string& unit); +template <> void VerilatedVcd::Super::set_time_resolution(const char* unitp); +template <> void VerilatedVcd::Super::set_time_resolution(const std::string& unit); +template <> void VerilatedVcd::Super::dumpvars(int level, const std::string& hier); #endif // DOXYGEN +//============================================================================= +// VerilatedVcdBuffer + +class VerilatedVcdBuffer final : public VerilatedTraceBuffer { + // Give the trace file access to the private bits + friend VerilatedVcd; + friend VerilatedVcd::Super; + +#ifdef VL_TRACE_PARALLEL + char* m_writep; // Write pointer into m_bufp + char* m_bufp; // The beginning of the trace buffer + size_t m_size; // The size of the buffer at m_bufp + char* m_growp; // Resize limit pointer +#else + char* m_writep = m_owner.m_writep; // Write pointer into output buffer + char* const m_wrFlushp = m_owner.m_wrFlushp; // Output buffer flush trigger location +#endif + + // VCD line end string codes + metadata + const char* const m_suffixes = m_owner.m_suffixes.data(); + // The maximum number of bytes a single signal can emit + const size_t m_maxSignalBytes = m_owner.m_maxSignalBytes; + + void finishLine(uint32_t code, char* writep); + +#ifdef VL_TRACE_PARALLEL + void adjustGrowp() { + m_growp = (m_bufp + m_size) - 
(2 * m_maxSignalBytes); + assert(m_growp >= m_bufp + m_maxSignalBytes); + } +#endif + +public: + // CONSTRUCTOR +#ifdef VL_TRACE_PARALLEL + explicit VerilatedVcdBuffer(VerilatedVcd& owner, char* bufp, size_t size); +#else + explicit VerilatedVcdBuffer(VerilatedVcd& owner); +#endif + ~VerilatedVcdBuffer() = default; + + //========================================================================= + // Implementation of VerilatedTraceBuffer interface + + // Implementations of duck-typed methods for VerilatedTraceBuffer. These are + // called from only one place (the full* methods), so always inline them. + VL_ATTR_ALWINLINE inline void emitBit(uint32_t code, CData newval); + VL_ATTR_ALWINLINE inline void emitCData(uint32_t code, CData newval, int bits); + VL_ATTR_ALWINLINE inline void emitSData(uint32_t code, SData newval, int bits); + VL_ATTR_ALWINLINE inline void emitIData(uint32_t code, IData newval, int bits); + VL_ATTR_ALWINLINE inline void emitQData(uint32_t code, QData newval, int bits); + VL_ATTR_ALWINLINE inline void emitWData(uint32_t code, const WData* newvalp, int bits); + VL_ATTR_ALWINLINE inline void emitDouble(uint32_t code, double newval); +}; + +//============================================================================= +// VerilatedFile +/// Class representing a file to write to. These virtual methods can be +/// overrode for e.g. socket I/O. 
+ +class VerilatedVcdFile VL_NOT_FINAL { +private: + int m_fd = 0; // File descriptor we're writing to +public: + // METHODS + /// Construct a (as yet) closed file + VerilatedVcdFile() = default; + /// Close and destruct + virtual ~VerilatedVcdFile() = default; + /// Open a file with given filename + virtual bool open(const std::string& name) VL_MT_UNSAFE; + /// Close object's file + virtual void close() VL_MT_UNSAFE; + /// Write data to file (if it is open) + virtual ssize_t write(const char* bufp, ssize_t len) VL_MT_UNSAFE; +}; + //============================================================================= // VerilatedVcdC /// Class representing a VCD dump file in C standalone (no SystemC) @@ -396,16 +304,6 @@ public: // Internal class access inline VerilatedVcd* spTrace() { return &m_sptrace; } - -#ifdef VL_TRACE_VCD_OLD_API - //========================================================================= - // Note: These are only for testing for backward compatibility with foreign - // code and is not used by Verilator. Do not use these as there is no - // guarantee of functionality. 
- - // Use evcd format - void evcd(bool flag) VL_MT_UNSAFE_ONE { m_sptrace.evcd(flag); } -#endif }; #endif // guard diff --git a/include/verilatedos.h b/include/verilatedos.h index 28412cac4..6bacfe27b 100644 --- a/include/verilatedos.h +++ b/include/verilatedos.h @@ -40,6 +40,7 @@ #ifdef __GNUC__ # define VL_ATTR_ALIGNED(alignment) __attribute__((aligned(alignment))) # define VL_ATTR_ALWINLINE __attribute__((always_inline)) +# define VL_ATTR_NOINLINE __attribute__((noinline)) # define VL_ATTR_COLD __attribute__((cold)) # define VL_ATTR_HOT __attribute__((hot)) # define VL_ATTR_NORETURN __attribute__((noreturn)) @@ -82,6 +83,9 @@ #ifndef VL_ATTR_ALWINLINE # define VL_ATTR_ALWINLINE ///< Attribute to inline, even when not optimizing #endif +#ifndef VL_ATTR_NOINLINE +# define VL_ATTR_NOINLINE ///< Attribute to never inline, even when optimizing +#endif #ifndef VL_ATTR_COLD # define VL_ATTR_COLD ///< Attribute that function is rarely executed #endif diff --git a/src/V3AstNodes.h b/src/V3AstNodes.h index b200d121b..7d3189551 100644 --- a/src/V3AstNodes.h +++ b/src/V3AstNodes.h @@ -8533,6 +8533,7 @@ public: AstNodeDType* childDTypep() const { return VN_AS(op1p(), NodeDType); } void childDTypep(AstNodeDType* nodep) { setOp1p(nodep); } AstNode* itemsp() const { return op2p(); } // op2 = AstPatReplicate, AstPatMember, etc + void addItemsp(AstNode* nodep) { addOp2p(nodep); } }; class AstPatMember final : public AstNodeMath { // Verilog '{a} or '{a{b}} diff --git a/src/V3AstUserAllocator.h b/src/V3AstUserAllocator.h index d230f0829..8d63ad5a9 100644 --- a/src/V3AstUserAllocator.h +++ b/src/V3AstUserAllocator.h @@ -106,7 +106,7 @@ public: } // Get a reference to the user data - T_Data& operator()(const T_Node* nodep) { + T_Data& operator()(const T_Node* nodep) const { T_Data* const userp = getUserp(nodep); UASSERT_OBJ(userp, nodep, "Missing User data on const AstNode"); return *userp; diff --git a/src/V3Case.cpp b/src/V3Case.cpp index 161f7db7e..c65fb3e7d 100644 --- 
a/src/V3Case.cpp +++ b/src/V3Case.cpp @@ -496,7 +496,7 @@ private: V3Case::caseLint(nodep); iterateChildren(nodep); if (debug() >= 9) nodep->dumpTree(cout, " case_old: "); - if (isCaseTreeFast(nodep) && v3Global.opt.oCase()) { + if (isCaseTreeFast(nodep) && v3Global.opt.fCase()) { // It's a simple priority encoder or complete statement // we can make a tree of statements to avoid extra comparisons ++m_statCaseFast; diff --git a/src/V3Const.cpp b/src/V3Const.cpp index 2cf230a7f..bfd6919df 100644 --- a/src/V3Const.cpp +++ b/src/V3Const.cpp @@ -111,6 +111,15 @@ class ConstBitOpTreeVisitor final : public VNVisitor { BitPolarityEntry() = default; }; + struct FrozenNodeInfo final { // Context when a frozen node is found + bool m_polarity; + int m_lsb; + bool operator<(const FrozenNodeInfo& other) const { + if (m_lsb != other.m_lsb) return m_lsb < other.m_lsb; + return m_polarity < other.m_polarity; + } + }; + class Restorer final { // Restore the original state unless disableRestore() is called ConstBitOpTreeVisitor& m_visitor; const size_t m_polaritiesSize; @@ -299,7 +308,8 @@ class ConstBitOpTreeVisitor final : public VNVisitor { LeafInfo* m_leafp = nullptr; // AstConst or AstVarRef that currently looking for const AstNode* const m_rootp; // Root of this AST subtree - std::vector m_frozenNodes; // Nodes that cannot be optimized + std::vector> + m_frozenNodes; // Nodes that cannot be optimized std::vector m_bitPolarities; // Polarity of bits found during iterate() std::vector> m_varInfos; // VarInfo for each variable, [0] is nullptr @@ -487,7 +497,7 @@ class ConstBitOpTreeVisitor final : public VNVisitor { restorer.restoreNow(); // Reach past a cast then add to frozen nodes to be added to final reduction if (const AstCCast* const castp = VN_CAST(opp, CCast)) opp = castp->lhsp(); - m_frozenNodes.push_back(opp); + m_frozenNodes.emplace_back(opp, FrozenNodeInfo{m_polarity, m_lsb}); m_failed = origFailed; continue; } @@ -652,17 +662,21 @@ public: } } + std::map> 
frozenNodes; // Group by FrozenNodeInfo // Check if frozen terms are clean or not - for (AstNode* const termp : visitor.m_frozenNodes) { + for (const auto& frozenInfo : visitor.m_frozenNodes) { + AstNode* const termp = frozenInfo.first; // Comparison operators are clean - if (VN_IS(termp, Eq) || VN_IS(termp, Neq) || VN_IS(termp, Lt) || VN_IS(termp, Lte) - || VN_IS(termp, Gt) || VN_IS(termp, Gte)) { + if ((VN_IS(termp, Eq) || VN_IS(termp, Neq) || VN_IS(termp, Lt) || VN_IS(termp, Lte) + || VN_IS(termp, Gt) || VN_IS(termp, Gte)) + && frozenInfo.second.m_lsb == 0) { hasCleanTerm = true; } else { // Otherwise, conservatively assume the frozen term is dirty hasDirtyTerm = true; UINFO(9, "Dirty frozen term: " << termp << endl); } + frozenNodes[frozenInfo.second].push_back(termp); } // Figure out if a final negation is required @@ -672,7 +686,12 @@ public: const bool needsCleaning = visitor.isAndTree() ? !hasCleanTerm : hasDirtyTerm; // Add size of reduction tree to op count - resultOps += termps.size() + visitor.m_frozenNodes.size() - 1; + resultOps += termps.size() - 1; + for (const auto& lsbAndNodes : frozenNodes) { + if (lsbAndNodes.first.m_lsb > 0) ++resultOps; // Needs AstShiftR + if (!lsbAndNodes.first.m_polarity) ++resultOps; // Needs AstNot + resultOps += lsbAndNodes.second.size(); + } // Add final polarity flip in Xor tree if (needsFlip) ++resultOps; // Add final cleaning AND @@ -681,7 +700,10 @@ public: if (debug() >= 9) { // LCOV_EXCL_START cout << "Bitop tree considered: " << endl; for (AstNode* const termp : termps) termp->dumpTree("Reduced term: "); - for (AstNode* const termp : visitor.m_frozenNodes) termp->dumpTree("Frozen term: "); + for (const std::pair& termp : visitor.m_frozenNodes) + termp.first->dumpTree("Frozen term with lsb " + std::to_string(termp.second.m_lsb) + + " polarity " + std::to_string(termp.second.m_polarity) + + ": "); cout << "Needs flipping: " << needsFlip << endl; cout << "Needs cleaning: " << needsCleaning << endl; cout << "Size: " 
<< resultOps << " input size: " << visitor.m_ops << endl; @@ -724,8 +746,25 @@ public: resultp = reduce(resultp, termp); } // Add any frozen terms to the reduction - for (AstNode* const frozenp : visitor.m_frozenNodes) { - resultp = reduce(resultp, frozenp->unlinkFrBack()); + for (auto&& nodes : frozenNodes) { + // nodes.second has same lsb and polarity + AstNode* termp = nullptr; + for (AstNode* const itemp : nodes.second) { + termp = reduce(termp, itemp->unlinkFrBack()); + } + if (nodes.first.m_lsb > 0) { // LSB is not 0, so shiftR + AstNodeDType* const dtypep = termp->dtypep(); + termp = new AstShiftR{termp->fileline(), termp, + new AstConst(termp->fileline(), AstConst::WidthedValue{}, + termp->width(), nodes.first.m_lsb)}; + termp->dtypep(dtypep); + } + if (!nodes.first.m_polarity) { // Polarity is inverted, so append Not + AstNodeDType* const dtypep = termp->dtypep(); + termp = new AstNot{termp->fileline(), termp}; + termp->dtypep(dtypep); + } + resultp = reduce(resultp, termp); } // Set width of masks to expected result width. 
This is required to prevent later removal @@ -1051,7 +1090,7 @@ private: bool matchBitOpTree(AstNode* nodep) { if (nodep->widthMin() != 1) return false; - if (!v3Global.opt.oConstBitOpTree()) return false; + if (!v3Global.opt.fConstBitOpTree()) return false; string debugPrefix; if (debug() >= 9) { // LCOV_EXCL_START @@ -1373,7 +1412,7 @@ private: return (VN_IS(nodep, And) || VN_IS(nodep, Or) || VN_IS(nodep, Xor)); } bool ifAdjacentSel(const AstSel* lhsp, const AstSel* rhsp) { - if (!v3Global.opt.oAssemble()) return false; // opt disabled + if (!v3Global.opt.fAssemble()) return false; // opt disabled if (!lhsp || !rhsp) return false; const AstNode* const lfromp = lhsp->fromp(); const AstNode* const rfromp = rhsp->fromp(); @@ -1388,7 +1427,7 @@ private: } bool ifMergeAdjacent(AstNode* lhsp, AstNode* rhsp) { // called by concatmergeable to determine if {lhsp, rhsp} make sense - if (!v3Global.opt.oAssemble()) return false; // opt disabled + if (!v3Global.opt.fAssemble()) return false; // opt disabled // two same varref if (operandsSame(lhsp, rhsp)) return true; const AstSel* lselp = VN_CAST(lhsp, Sel); @@ -1425,7 +1464,7 @@ private: } bool concatMergeable(const AstNode* lhsp, const AstNode* rhsp, unsigned depth) { // determine if {a OP b, c OP d} => {a, c} OP {b, d} is advantageous - if (!v3Global.opt.oAssemble()) return false; // opt disabled + if (!v3Global.opt.fAssemble()) return false; // opt disabled if (lhsp->type() != rhsp->type()) return false; if (!ifConcatMergeableBiop(lhsp)) return false; if (depth > CONCAT_MERGABLE_MAX_DEPTH) return false; // As worse case O(n^2) algorithm @@ -2511,7 +2550,7 @@ private: if (nodep->access().isReadOnly() && ((!m_params // Can reduce constant wires into equations && m_doNConst - && v3Global.opt.oConst() + && v3Global.opt.fConst() // Default value, not a "known" constant for this usage && !nodep->varp()->isClassMember() && !(nodep->varp()->isFuncLocal() && nodep->varp()->isNonOutput()) diff --git a/src/V3EmitCImp.cpp 
b/src/V3EmitCImp.cpp index e07648a2d..2cc813751 100644 --- a/src/V3EmitCImp.cpp +++ b/src/V3EmitCImp.cpp @@ -752,26 +752,26 @@ class EmitCTrace final : EmitCFunc { const string func = nodep->full() ? "full" : "chg"; bool emitWidth = true; if (nodep->dtypep()->basicp()->isDouble()) { - puts("tracep->" + func + "Double"); + puts("bufp->" + func + "Double"); emitWidth = false; } else if (nodep->isWide() || emitTraceIsScBv(nodep) || emitTraceIsScBigUint(nodep)) { - puts("tracep->" + func + "WData"); + puts("bufp->" + func + "WData"); } else if (nodep->isQuad()) { - puts("tracep->" + func + "QData"); + puts("bufp->" + func + "QData"); } else if (nodep->declp()->widthMin() > 16) { - puts("tracep->" + func + "IData"); + puts("bufp->" + func + "IData"); } else if (nodep->declp()->widthMin() > 8) { - puts("tracep->" + func + "SData"); + puts("bufp->" + func + "SData"); } else if (nodep->declp()->widthMin() > 1) { - puts("tracep->" + func + "CData"); + puts("bufp->" + func + "CData"); } else { - puts("tracep->" + func + "Bit"); + puts("bufp->" + func + "Bit"); emitWidth = false; } const uint32_t offset = (arrayindex < 0) ? 0 : (arrayindex * nodep->declp()->widthWords()); const uint32_t code = nodep->declp()->code() + offset; - puts(v3Global.opt.useTraceOffloadThread() && !nodep->full() ? "(base+" : "(oldp+"); + puts(v3Global.opt.useTraceOffload() && !nodep->full() ? "(base+" : "(oldp+"); puts(cvtToStr(code - nodep->baseCode())); puts(","); emitTraceValue(nodep, arrayindex); diff --git a/src/V3EmitCMake.cpp b/src/V3EmitCMake.cpp index 67e8a741c..7df71dfeb 100644 --- a/src/V3EmitCMake.cpp +++ b/src/V3EmitCMake.cpp @@ -113,9 +113,8 @@ class CMakeEmitter final { cmake_set_raw(*of, name + "_COVERAGE", v3Global.opt.coverage() ? "1" : "0"); *of << "# Threaded output mode? 0/1/N threads (from --threads)\n"; cmake_set_raw(*of, name + "_THREADS", cvtToStr(v3Global.opt.threads())); - *of << "# Threaded tracing output mode? 
0/1/N threads (from --trace-threads)\n"; - cmake_set_raw(*of, name + "_TRACE_THREADS", - cvtToStr(v3Global.opt.useTraceOffloadThread())); + *of << "# Threaded tracing output mode? 0/1/N threads (from --threads/--trace-threads)\n"; + cmake_set_raw(*of, name + "_TRACE_THREADS", cvtToStr(v3Global.opt.vmTraceThreads())); cmake_set_raw(*of, name + "_TRACE_FST_WRITER_THREAD", v3Global.opt.traceThreads() && v3Global.opt.traceFormat().fst() ? "1" : "0"); *of << "# Struct output mode? 0/1 (from --trace-structs)\n"; diff --git a/src/V3EmitMk.cpp b/src/V3EmitMk.cpp index 429b78d33..b748d9553 100644 --- a/src/V3EmitMk.cpp +++ b/src/V3EmitMk.cpp @@ -73,9 +73,10 @@ public: of.puts("VM_TRACE_FST = "); of.puts(v3Global.opt.trace() && v3Global.opt.traceFormat().fst() ? "1" : "0"); of.puts("\n"); - of.puts("# Tracing threaded output mode? 0/1/N threads (from --trace-thread)\n"); + of.puts( + "# Tracing threaded output mode? 0/1/N threads (from --threads/--trace-thread)\n"); of.puts("VM_TRACE_THREADS = "); - of.puts(cvtToStr(v3Global.opt.useTraceOffloadThread())); + of.puts(cvtToStr(v3Global.opt.vmTraceThreads())); of.puts("\n"); of.puts("# Separate FST writer thread? 
0/1 (from --trace-fst with --trace-thread > 0)\n"); of.puts("VM_TRACE_FST_WRITER_THREAD = "); diff --git a/src/V3Gate.cpp b/src/V3Gate.cpp index 4b66c2661..cf3485121 100644 --- a/src/V3Gate.cpp +++ b/src/V3Gate.cpp @@ -397,11 +397,11 @@ private: // Then propagate more complicated equations optimizeSignals(true); // Remove redundant logic - if (v3Global.opt.oDedupe()) { + if (v3Global.opt.fDedupe()) { dedupe(); if (debug() >= 6) m_graph.dumpDotFilePrefixed("gate_dedup"); } - if (v3Global.opt.oAssemble()) { + if (v3Global.opt.fAssemble()) { mergeAssigns(); if (debug() >= 6) m_graph.dumpDotFilePrefixed("gate_assm"); } diff --git a/src/V3GraphAcyc.cpp b/src/V3GraphAcyc.cpp index a62fd3d9d..0df758ed1 100644 --- a/src/V3GraphAcyc.cpp +++ b/src/V3GraphAcyc.cpp @@ -254,7 +254,7 @@ void GraphAcyc::simplify(bool allowCut) { if (allowCut) { // The main algorithm works without these, though slower // So if changing the main algorithm, comment these out for a test run - if (v3Global.opt.oAcycSimp()) { + if (v3Global.opt.fAcycSimp()) { cutBasic(vertexp); cutBackward(vertexp); } diff --git a/src/V3MergeCond.cpp b/src/V3MergeCond.cpp index 673326f27..3881c48df 100644 --- a/src/V3MergeCond.cpp +++ b/src/V3MergeCond.cpp @@ -42,6 +42,34 @@ // // Also merges consecutive AstNodeIf statements with the same condition. // +// Because this optimization has notable performance impact, we go further +// and perform code motion to try to move mergeable conditionals next to each +// other, which in turn enable us to merge more conditionals. To do this, we +// perform an analysis pass, followed by an optimization pass on the whole +// AstCFunc we are optimizing. +// +// The analysis pass gathers, for each statement in the tree, the information +// relevant for determining whether two statements can be swapped, and some +// other additional information that is useful during optimization. 
+// +// The optimization pass tries to move conditionals near each other, first by +// trying to move a conditional node backwards in the list, so it becomes the +// direct successor of another earlier conditional with the same condition. +// If this is not possible due to variable interference, then we additionally +// try to pull earlier conditionals with the same condition closer forward to +// be the immediate predecessor of the conditional node. We limit maximum +// distance a node can travel to an empirically chosen but otherwise arbitrary +// constant. This limits worst case complexity to be O(n) rather than O(n^2). +// The worst case complexity manifests when N/2 conditionals, all with unique +// conditions are succeeded by N/2 conditionals with the same unique +// conditions, such that each unique condition is used by exactly 2 +// conditionals. In this case N/2 all nodes need to travel approx N/2 distance. +// Limiting the distance bounds the latter, hence limiting complexity. +// +// Once the analysis and optimization passes have been applied to the whole +// function, any merged conditionals will then undergo the same analysis, +// optimization, and merging again in their individual branches. 
+// //************************************************************************* #include "config_build.h" @@ -51,71 +79,364 @@ #include "V3MergeCond.h" #include "V3Stats.h" #include "V3Ast.h" +#include "V3AstUserAllocator.h" +#include "V3Hasher.h" +#include "V3DupFinder.h" + +#include +#include + +namespace { //###################################################################### +// Utilities -enum class Mergeable { - YES, // Tree can be merged - NO_COND_ASSIGN, // Tree cannot be merged because it contains an assignment to a condition - NO_IMPURE // Tree cannot be merged because it contains an impure node +// This function extracts the Cond node from the RHS of an assignment, +// if there is one and it is in a supported position, which are: +// - RHS is the Cond +// - RHS is And(Const, Cond). This And is inserted often by V3Clean. +AstNodeCond* extractCondFromRhs(AstNode* rhsp) { + if (AstNodeCond* const condp = VN_CAST(rhsp, NodeCond)) { + return condp; + } else if (const AstAnd* const andp = VN_CAST(rhsp, And)) { + if (AstNodeCond* const condp = VN_CAST(andp->rhsp(), NodeCond)) { + if (VN_IS(andp->lhsp(), Const)) return condp; + } + } + return nullptr; +} + +// Predicate to check if two sets are disjoint. This is stable, as we only need +// to determine if the sets contain a shared element, which is a boolean +// property. It is also efficient as we use sorted sets, and therefore can +// enumerate elements in order (what the ordering is, is unimportant), meaning +// the worst case complexity is O(size of smaller set). 
+bool areDisjoint(const std::set& a, const std::set& b) { + if (a.empty() || b.empty()) return true; + const auto endA = a.end(); + const auto endB = b.end(); + auto itA = a.begin(); + auto itB = b.begin(); + while (true) { + if (*itA == *itB) return false; + if (std::less{}(*itA, *itB)) { + itA = std::lower_bound(++itA, endA, *itB); + if (itA == endA) return true; + } else { + itB = std::lower_bound(++itB, endB, *itA); + if (itB == endB) return true; + } + } +} + +//###################################################################### +// Structure containing information required for code motion/merging + +struct StmtProperties { + AstNode* m_condp = nullptr; // The condition expression, if a conditional node + std::set m_rdVars; // Variables read by this statement + std::set m_wrVars; // Variables writen by this statement + bool m_isFence = false; // Nothing should move across this statement, nor should it be merged + AstNodeStmt* m_prevWithSameCondp = nullptr; // Previous node in same list, with same condition + bool writesConditionVar() const { + // This relies on MarkVarsVisitor having been called on the condition node + for (const AstVar* const varp : m_wrVars) { + if (varp->user1()) return true; + } + return false; + } }; -class CheckMergeableVisitor final : public VNVisitor { -private: - // STATE - bool m_condAssign = false; // Does this tree contain an assignment to a condition variable?? - bool m_impure = false; // Does this tree contain an impure node? 
+// We store the statement properties in user3 via AstUser3Allocator +using StmtPropertiesAllocator = AstUser3Allocator; - // METHODS - VL_DEBUG_FUNC; // Declare debug() +//###################################################################### +// Code motion analysis and implementation - // VISITORS - virtual void visit(AstNode* nodep) override { - if (m_impure) return; - // Clear if node is impure - if (!nodep->isPure()) { - UINFO(9, "Not mergeable due to impure node" << nodep << endl); - m_impure = true; - return; +// Pure analysis visitor that build the StmtProperties for each statement in the given +// AstNode list (following AstNode::nextp()) +class CodeMotionAnalysisVisitor final : public VNVisitor { + // NODE STATE + // AstNodeStmt::user3 -> StmtProperties (accessed via m_stmtProperties, managed externally, + // see MergeCondVisitor::process) + // AstNode::user4 -> Used by V3Hasher + // AstNode::user5 -> AstNode*: Set on a condition node, points to the last conditional + // with that condition so far encountered in the same AstNode list + + VNUser5InUse m_user5InUse; + + StmtPropertiesAllocator& m_stmtProperties; + + // MEMBERS + V3Hasher m_hasher; // Used by V3DupFinder + // Stack of a V3DupFinder used for finding identical condition expressions within one + // statement list. 
+ std::vector m_stack; + StmtProperties* m_propsp = nullptr; // StmtProperties structure of current AstNodeStmt + + // Extract condition expression from a megeable conditional statement, if any + static AstNode* extractCondition(const AstNodeStmt* nodep) { + AstNode* conditionp = nullptr; + if (const AstNodeAssign* const assignp = VN_CAST(nodep, NodeAssign)) { + if (AstNodeCond* const conditionalp = extractCondFromRhs(assignp->rhsp())) { + conditionp = conditionalp->condp(); + } + } else if (const AstNodeIf* const ifp = VN_CAST(nodep, NodeIf)) { + conditionp = ifp->condp(); } + while (AstCCast* const castp = VN_CAST(conditionp, CCast)) conditionp = castp->lhsp(); + return conditionp; + } + + void analyzeStmt(AstNodeStmt* nodep, bool tryCondMatch) { + VL_RESTORER(m_propsp); + // Keep hold of props of enclosing statement + StmtProperties* const outerPropsp = m_propsp; + // Grab the props of this statement + m_propsp = &m_stmtProperties(nodep); + + // Extract condition from statement + if (AstNode* const condp = extractCondition(nodep)) { + // Remember condition node. We always need this as it is used in the later + // traversal. + m_propsp->m_condp = condp; + // If this is a conditional statement, try to find an earlier one with the same + // condition in the same list (unless we have been told not to bother because we know + // this node is in a singleton list). 
+ if (tryCondMatch) { + // Grab the duplicate finder of this list + V3DupFinder& dupFinder = m_stack.back(); + // Find a duplicate condition + const V3DupFinder::iterator& dit = dupFinder.findDuplicate(condp); + if (dit == dupFinder.end()) { + // First time seeing this condition in the current list + dupFinder.insert(condp); + // Remember last statement with this condition (which is this statement) + condp->user5p(nodep); + } else { + // Seen a conditional with the same condition earlier in the current list + AstNode* const firstp = dit->second; + // Add to properties for easy retrieval during optimization + m_propsp->m_prevWithSameCondp = static_cast(firstp->user5p()); + // Remember last statement with this condition (which is this statement) + firstp->user5p(nodep); + } + } + } + + // Analyse this statement + analyzeNode(nodep); + + // If there is an enclosing statement, propagate properties upwards + if (outerPropsp) { + // Add all rd/wr vars to outer statement + outerPropsp->m_rdVars.insert(m_propsp->m_rdVars.cbegin(), m_propsp->m_rdVars.cend()); + outerPropsp->m_wrVars.insert(m_propsp->m_wrVars.cbegin(), m_propsp->m_wrVars.cend()); + // If this statement is impure, the enclosing statement is also impure + if (m_propsp->m_isFence) outerPropsp->m_isFence = true; + } + } + + void analyzeVarRef(AstVarRef* nodep) { + const VAccess access = nodep->access(); + AstVar* const varp = nodep->varp(); + // Gather read and written variables + if (access.isReadOrRW()) m_propsp->m_rdVars.insert(varp); + if (access.isWriteOrRW()) m_propsp->m_wrVars.insert(varp); + } + + void analyzeNode(AstNode* nodep) { + // If an impure node under a statement, mark that statement as impure + if (m_propsp && !nodep->isPure()) m_propsp->m_isFence = true; + // Analyze children iterateChildrenConst(nodep); } - virtual void visit(AstVarRef* nodep) override { - if (m_impure || m_condAssign) return; - // Clear if it's an LValue referencing a marked variable - if (nodep->access().isWriteOrRW() && 
nodep->varp()->user1()) { - UINFO(9, "Not mergeable due assignment to condition" << nodep << endl); - m_condAssign = true; + + // VISITORS + void visit(AstNode* nodep) override { + // Push a new stack entry at the start of a list, but only if the list is not a + // single element (this saves a lot of allocations in expressions) + bool singletonListStart = false; + if (nodep->backp()->nextp() != nodep) { // If at head of list + singletonListStart = nodep->nextp() == nullptr; + if (!singletonListStart) m_stack.emplace_back(m_hasher); } + + // Analyse node + if (AstNodeStmt* const stmtp = VN_CAST(nodep, NodeStmt)) { + analyzeStmt(stmtp, /*tryCondMatch:*/ !singletonListStart); + } else if (AstVarRef* const vrefp = VN_CAST(nodep, VarRef)) { + analyzeVarRef(vrefp); + } else { + analyzeNode(nodep); + } + + // Pop the stack at the end of a list + if (!singletonListStart && !nodep->nextp()) m_stack.pop_back(); + } + + // CONSTRUCTOR + CodeMotionAnalysisVisitor(AstNode* nodep, StmtPropertiesAllocator& stmtProperties) + : m_stmtProperties(stmtProperties) { + iterateAndNextConstNull(nodep); } public: - CheckMergeableVisitor() = default; - - // Return false if this node should not be merged at all because: - // - It contains an impure expression - // - It contains an LValue referencing the condition - Mergeable operator()(const AstNode* node) { - m_condAssign = false; - m_impure = false; - iterateChildrenConst(const_cast(node)); - if (m_impure) { // Impure is stronger than cond assign - return Mergeable::NO_IMPURE; - } else if (m_condAssign) { - return Mergeable::NO_COND_ASSIGN; - } else { - return Mergeable::YES; - } + // Analyse the statement list starting at nodep, filling in stmtProperties. + static void analyze(AstNode* nodep, StmtPropertiesAllocator& stmtProperties) { + CodeMotionAnalysisVisitor{nodep, stmtProperties}; } }; +class CodeMotionOptimizeVisitor final : public VNVisitor { + // Do not move a node more than this many statements. 
+ // This bounds complexity at O(N), rather than O(N^2). + static constexpr unsigned MAX_DISTANCE = 500; + + // NODE STATE + // AstNodeStmt::user3 -> StmtProperties (accessed via m_stmtProperties, managed externally, + // see MergeCondVisitor::process) + // AstNodeStmt::user4 -> bool: Already processed this node + + VNUser4InUse m_user4InUse; + + const StmtPropertiesAllocator& m_stmtProperties; + + // MEMBERS + + // Predicate that checks if the order of two statements can be swapped + bool areSwappable(const AstNodeStmt* ap, const AstNodeStmt* bp) const { + const StmtProperties& aProps = m_stmtProperties(ap); + const StmtProperties& bProps = m_stmtProperties(bp); + // Don't move across fences + if (aProps.m_isFence) return false; + if (bProps.m_isFence) return false; + // If either statement writes a variable that the other reads, they are not swappable + if (!areDisjoint(aProps.m_rdVars, bProps.m_wrVars)) return false; + if (!areDisjoint(bProps.m_rdVars, aProps.m_wrVars)) return false; + // If they both write to the same variable, they are not swappable + if (!areDisjoint(aProps.m_wrVars, bProps.m_wrVars)) return false; + // Otherwise good to go + return true; + } + + // VISITORS + void visit(AstNodeStmt* nodep) override { + // Process only on first encounter + if (nodep->user4SetOnce()) return; + // First re-order children + iterateChildren(nodep); + // Grab hold of previous node with same condition + AstNodeStmt* prevp = m_stmtProperties(nodep).m_prevWithSameCondp; + // If no previous node with same condition, we are done + if (!prevp) return; +#ifdef VL_DEBUG + { // Sanity check, only in debug build, otherwise expensive + const AstNode* currp = prevp; + while (currp && currp != nodep) currp = currp->nextp(); + UASSERT_OBJ(currp, nodep, "Predecessor not in same list as " << currp); + } +#endif + // Otherwise try to move this node backwards, as close as we can to the previous node + // with the same condition + if (AstNodeStmt* predp = VN_CAST(nodep->backp(), 
NodeStmt)) { + // 'predp' is the newly computed predecessor node of 'nodep', which is initially + // (without movement) the 'backp' of the node. + for (unsigned i = MAX_DISTANCE; i; --i) { + // If the predecessor is the previous node with the same condition, job done + if (predp == prevp) break; + // Don't move past a non-statement (e.g.: AstVar), or end of list + AstNodeStmt* const backp = VN_CAST(predp->backp(), NodeStmt); + if (!backp) break; + // Don't swap statements if doing so would change program semantics + if (!areSwappable(predp, nodep)) break; + // Otherwise move 'nodep' back + predp = backp; + } + + // If we decided that 'nodep' should be moved back + if (nodep->backp() != predp) { + // Move the current node to directly follow the computed predecessor + nodep->unlinkFrBack(); + predp->addNextHere(nodep); + // If the predecessor is the previous node with the same condition, job done + if (predp == prevp) return; + } + } + // If we reach here, it means we were unable to move the current node all the way back + // such that it immediately follows the previous statement with the same condition. Now + // try to move all previous statements with the same condition forward, in the hope of + // compacting the list further. + for (AstNodeStmt* currp = nodep; prevp; + currp = prevp, prevp = m_stmtProperties(currp).m_prevWithSameCondp) { + // Move prevp (previous statement with same condition) towards currp + if (AstNodeStmt* succp = VN_CAST(prevp->nextp(), NodeStmt)) { + // 'succp' is the newly computed successor node of 'prevp', which is initially + // (without movement) the 'nextp' of the node. 
+ for (unsigned i = MAX_DISTANCE; --i;) { + // If the successor of the previous statement with same condition is the + // target node, we are done with this predecessor + if (succp == currp) break; + // Don't move past a non-statement (e.g.: AstVar), or end of list + AstNodeStmt* const nextp = VN_CAST(succp->nextp(), NodeStmt); + if (!nextp) break; + // Don't swap statements if doing so would change program semantics + if (!areSwappable(prevp, succp)) break; + // Otherwise move further forward + succp = nextp; + } + + // If we decided that 'prevp' should be moved forward + if (prevp->nextp() != succp) { + // Move the current node to directly before the computed successor + prevp->unlinkFrBack(); + succp->addHereThisAsNext(prevp); + } + } + } + } + + void visit(AstNode* nodep) override {} // Ignore all non-statements + + // CONSTRUCTOR + CodeMotionOptimizeVisitor(AstNode* nodep, const StmtPropertiesAllocator& stmtProperties) + : m_stmtProperties(stmtProperties) { + // We assert the given node is at the head of the list otherwise we might move a node + // before the given node. This is easy to fix in the above iteration with a check on a + // boundary node we should not move past, if we ever need to do so. + // Note: we will do iterateAndNextNull which requires nodep->backp() != nullptr anyway + UASSERT_OBJ(nodep->backp()->nextp() != nodep, nodep, "Must be at head of list"); + // Optimize the list + iterateAndNextNull(nodep); + } + +public: + // Given an AstNode list (held via AstNode::nextp()), move conditional statements as close + // together as possible + static AstNode* optimize(AstNode* nodep, const StmtPropertiesAllocator& stmtProperties) { + CodeMotionOptimizeVisitor{nodep, stmtProperties}; + // It is possible for the head of the list to be moved later such that it is no longer + // in head position. If so, rewind the list and return the new head. 
+ while (nodep->backp()->nextp() == nodep) nodep = nodep->backp(); + return nodep; + } +}; + +//###################################################################### +// Conditional merging + class MergeCondVisitor final : public VNVisitor { private: // NODE STATE - // AstVar::user1 -> Flag set for variables referenced by m_mgCondp - // AstNode::user2 -> Flag marking node as included in merge because cheap to duplicate - const VNUser1InUse m_user1InUse; - const VNUser2InUse m_user2InUse; + // AstVar::user1 -> bool: Set for variables referenced by m_mgCondp + // (Only below MergeCondVisitor::process). + // AstNode::user2 -> bool: Marking node as included in merge because cheap to + // duplicate + // (Only below MergeCondVisitor::process). + // AstNodeStmt::user3 -> StmtProperties + // (Only below MergeCondVisitor::process). + // AstNode::user4 -> See CodeMotionAnalysisVisitor/CodeMotionOptimizeVisitor + // AstNode::user5 -> See CodeMotionAnalysisVisitor // STATE VDouble0 m_statMerges; // Statistic tracking @@ -128,24 +449,84 @@ private: const AstNode* m_mgNextp = nullptr; // Next node in list being examined uint32_t m_listLenght = 0; // Length of current list - CheckMergeableVisitor m_checkMergeable; // Sub visitor for encapsulation & speed + std::queue* m_workQueuep = nullptr; // Node lists (via AstNode::nextp()) to merge + // Statement properties for code motion and merging + StmtPropertiesAllocator* m_stmtPropertiesp = nullptr; // METHODS VL_DEBUG_FUNC; // Declare debug() - // This function extracts the Cond node from the RHS, if there is one and - // it is in a supported position, which are: - // - RHS is the Cond - // - RHS is And(Const, Cond). This And is inserted often by V3Clean. 
- static AstNodeCond* extractCond(AstNode* rhsp) { - if (AstNodeCond* const condp = VN_CAST(rhsp, NodeCond)) { - return condp; - } else if (const AstAnd* const andp = VN_CAST(rhsp, And)) { - if (AstNodeCond* const condp = VN_CAST(andp->rhsp(), NodeCond)) { - if (VN_IS(andp->lhsp(), Const)) return condp; - } + // Function that processes a whole sub-tree + void process(AstNode* nodep) { + // Set up work queue + std::queue workQueue; + m_workQueuep = &workQueue; + m_workQueuep->push(nodep); + + do { + // Set up user* for this iteration + const VNUser1InUse user1InUse; + const VNUser2InUse user2InUse; + const VNUser3InUse user3InUse; + // Statement properties only preserved for this iteration, + // then memory is released immediately. + StmtPropertiesAllocator stmtProperties; + m_stmtPropertiesp = &stmtProperties; + + // Pop off current work item + AstNode* currp = m_workQueuep->front(); + m_workQueuep->pop(); + + // Analyse sub-tree list for code motion + CodeMotionAnalysisVisitor::analyze(currp, stmtProperties); + // Perform the code motion within the whole sub-tree list + currp = CodeMotionOptimizeVisitor::optimize(currp, stmtProperties); + + // Merge conditionals in the whole sub-tree list (this might create new work items) + iterateAndNextNull(currp); + + // Close pending merge, if there is one at the end of the whole sub-tree list + if (m_mgFirstp) mergeEnd(); + } while (!m_workQueuep->empty()); + } + + // Skip past AstArraySel and AstWordSel with const index + static AstNode* skipConstSels(AstNode* nodep) { + while (const AstArraySel* const aselp = VN_CAST(nodep, ArraySel)) { + // ArraySel index is not constant, so might be expensive + if (!VN_IS(aselp->bitp(), Const)) return nodep; + nodep = aselp->fromp(); } - return nullptr; + while (const AstWordSel* const wselp = VN_CAST(nodep, WordSel)) { + // WordSel index is not constant, so might be expensive + if (!VN_IS(wselp->bitp(), Const)) return nodep; + nodep = wselp->fromp(); + } + return nodep; + } + + // Check 
if this node is cheap enough that duplicating it in two branches of an + // AstIf is not likely to cause a performance degradation. + static bool isCheapNode(AstNode* nodep) { + // Comments are cheap + if (VN_IS(nodep, Comment)) return true; + // So are some assignments + if (const AstNodeAssign* const assignp = VN_CAST(nodep, NodeAssign)) { + // Check LHS + AstNode* const lhsp = skipConstSels(assignp->lhsp()); + // LHS is not a VarRef, so might be expensive + if (!VN_IS(lhsp, VarRef)) return false; + + // Check RHS + AstNode* const rhsp = skipConstSels(assignp->rhsp()); + // RHS is not a VarRef or Constant so might be expensive + if (!VN_IS(rhsp, VarRef) && !VN_IS(rhsp, Const)) return false; + + // Otherwise it is a cheap assignment + return true; + } + // Others are not + return false; } // Predicate to check if an expression yields only 0 or 1 (i.e.: a 1-bit value) @@ -196,23 +577,21 @@ private: static AstNode* maskLsb(AstNode* nodep) { if (yieldsOneOrZero(nodep)) return nodep; // Otherwise apply masking - AstNode* const maskp = new AstConst(nodep->fileline(), AstConst::BitTrue()); + AstNode* const maskp = new AstConst{nodep->fileline(), AstConst::BitTrue()}; // Mask on left, as conventional - return new AstAnd(nodep->fileline(), maskp, nodep); + return new AstAnd{nodep->fileline(), maskp, nodep}; } - // Fold the RHS expression assuming the given condition state. Unlink bits - // from the RHS which is only used once, and can be reused. What remains - // of the RHS is expected to be deleted by the caller. + // Fold the RHS expression of an assignment assuming the given condition state. + // Unlink bits from the RHS which is only used once, and can be reused (is an unomdified + // sub-tree). What remains of the RHS is expected to be deleted by the caller. 
AstNode* foldAndUnlink(AstNode* rhsp, bool condTrue) { if (rhsp->sameTree(m_mgCondp)) { - return new AstConst(rhsp->fileline(), AstConst::BitTrue{}, condTrue); - } else if (const AstNodeCond* const condp = extractCond(rhsp)) { + return new AstConst{rhsp->fileline(), AstConst::BitTrue{}, condTrue}; + } else if (const AstNodeCond* const condp = extractCondFromRhs(rhsp)) { AstNode* const resp = condTrue ? condp->expr1p()->unlinkFrBack() : condp->expr2p()->unlinkFrBack(); - if (condp == rhsp) { // - return resp; - } + if (condp == rhsp) return resp; if (const AstAnd* const andp = VN_CAST(rhsp, And)) { UASSERT_OBJ(andp->rhsp() == condp, rhsp, "Should not try to fold this"); return new AstAnd{andp->fileline(), andp->lhsp()->cloneTree(false), resp}; @@ -227,17 +606,18 @@ private: return condTrue ? maskLsb(andp->lhsp()->unlinkFrBack()) : new AstConst{rhsp->fileline(), AstConst::BitFalse()}; } - } else if (VN_IS(rhsp, WordSel) || VN_IS(rhsp, VarRef) || VN_IS(rhsp, Const)) { + } else if (VN_IS(rhsp, ArraySel) || VN_IS(rhsp, WordSel) || VN_IS(rhsp, VarRef) + || VN_IS(rhsp, Const)) { return rhsp->cloneTree(false); } - rhsp->dumpTree("Don't know how to fold expression: "); - rhsp->v3fatalSrc("Don't know how to fold expression"); + // LCOV_EXCL_START + if (debug()) rhsp->dumpTree("Don't know how to fold expression: "); + rhsp->v3fatalSrc("Should not try to fold this during conditional merging"); + // LCOV_EXCL_STOP } - void mergeEnd(int lineno) { - UASSERT(m_mgFirstp, "mergeEnd without list " << lineno); - // We might want to recursively merge an AstIf. We stash it in this variable. - const AstNodeIf* recursivep = nullptr; + void mergeEnd() { + UASSERT(m_mgFirstp, "mergeEnd without list"); // Drop leading cheap nodes. These were only added in the hope of finding // an earlier reduced form, but we failed to do so. 
while (m_mgFirstp->user2() && m_mgFirstp != m_mgLastp) { @@ -254,8 +634,11 @@ private: m_mgLastp = m_mgLastp->backp(); --m_listLenght; UASSERT_OBJ(m_mgLastp && m_mgLastp->nextp() == nextp, m_mgFirstp, - "Cheap assignment should not be at the front of the list"); + "Cheap statement should not be at the front of the list"); } + // If the list contains a single AstNodeIf, we will want to merge its branches. + // If so, keep hold of the AstNodeIf in this variable. + AstNodeIf* recursivep = nullptr; // Merge if list is longer than one node if (m_mgFirstp != m_mgLastp) { UINFO(6, "MergeCond - First: " << m_mgFirstp << " Last: " << m_mgLastp << endl); @@ -266,7 +649,7 @@ private: // and we also need to keep track of it for comparisons later. m_mgCondp = m_mgCondp->cloneTree(false); // Create equivalent 'if' statement and insert it before the first node - AstIf* const resultp = new AstIf(m_mgCondp->fileline(), m_mgCondp); + AstIf* const resultp = new AstIf{m_mgCondp->fileline(), m_mgCondp}; m_mgFirstp->addHereThisAsNext(resultp); // Unzip the list and insert under branches AstNode* nextp = m_mgFirstp; @@ -308,10 +691,12 @@ private: VL_DO_DANGLING(ifp->deleteTree(), ifp); } } while (nextp); - // Recursively merge the resulting AstIf - recursivep = resultp; - } else if (const AstNodeIf* const ifp = VN_CAST(m_mgFirstp, NodeIf)) { - // There was nothing to merge this AstNodeIf with, but try to merge it's branches + // Merge the branches of the resulting AstIf after re-analysis + if (resultp->ifsp()) m_workQueuep->push(resultp->ifsp()); + if (resultp->elsesp()) m_workQueuep->push(resultp->elsesp()); + } else if (AstNodeIf* const ifp = VN_CAST(m_mgFirstp, NodeIf)) { + // There was nothing to merge this AstNodeIf with, so try to merge its branches. 
+ // No re-analysis is required for this, so do it directly below recursivep = ifp; } // Reset state @@ -321,14 +706,13 @@ private: m_mgNextp = nullptr; AstNode::user1ClearTree(); // Clear marked variables AstNode::user2ClearTree(); - // Merge recursively within the branches + // Merge recursively within the branches of an un-merged AstNodeIF if (recursivep) { iterateAndNextNull(recursivep->ifsp()); - // Close list, if there is one at the end of the then branch - if (m_mgFirstp) mergeEnd(__LINE__); iterateAndNextNull(recursivep->elsesp()); - // Close list, if there is one at the end of the else branch - if (m_mgFirstp) mergeEnd(__LINE__); + // Close a pending merge to ensure merge state is + // reset as expected at the end of this function + if (m_mgFirstp) mergeEnd(); } } @@ -351,47 +735,16 @@ private: return false; } - // Check if this node is cheap enough that duplicating it in two branches of an - // AstIf and is hence not likely to cause a performance degradation if doing so. - bool isCheapNode(AstNode* nodep) const { - if (VN_IS(nodep, Comment)) return true; - if (const AstNodeAssign* const assignp = VN_CAST(nodep, NodeAssign)) { - // Check LHS - AstNode* lhsp = assignp->lhsp(); - while (AstWordSel* const wselp = VN_CAST(lhsp, WordSel)) { - // WordSel index is not constant, so might be expensive - if (!VN_IS(wselp->bitp(), Const)) return false; - lhsp = wselp->fromp(); - } - // LHS is not a VarRef, so might be expensive - if (!VN_IS(lhsp, VarRef)) return false; - - // Check RHS - AstNode* rhsp = assignp->rhsp(); - while (AstWordSel* const wselp = VN_CAST(rhsp, WordSel)) { - // WordSel index is not constant, so might be expensive - if (!VN_IS(wselp->bitp(), Const)) return false; - rhsp = wselp->fromp(); - } - // RHS is not a VarRef or Constant so might be expensive - if (!VN_IS(rhsp, VarRef) && !VN_IS(rhsp, Const)) return false; - - // Otherwise it is a cheap assignment - return true; - } - return false; - } - - bool addToList(AstNode* nodep, AstNode* condp, 
int line) { + bool addToList(AstNodeStmt* nodep, AstNode* condp) { // Set up head of new list if node is first in list if (!m_mgFirstp) { - UASSERT_OBJ(condp, nodep, "Cannot start new list without condition " << line); + UASSERT_OBJ(condp, nodep, "Cannot start new list without condition"); // Mark variable references in the condition condp->foreach([](const AstVarRef* nodep) { nodep->varp()->user1(1); }); // Now check again if mergeable. We need this to pick up assignments to conditions, // e.g.: 'c = c ? a : b' at the beginning of the list, which is in fact not mergeable // because it updates the condition. We simply bail on these. - if (m_checkMergeable(nodep) != Mergeable::YES) { + if ((*m_stmtPropertiesp)(nodep).writesConditionVar()) { // Clear marked variables AstNode::user1ClearTree(); // We did not add to the list @@ -400,11 +753,13 @@ private: m_mgFirstp = nodep; m_mgCondp = condp; m_listLenght = 0; - // Add any preceding nodes to the list that would allow us to extend the merge range - for (;;) { - AstNode* const backp = m_mgFirstp->backp(); + // Add any preceding nodes to the list that would allow us to extend the merge + // range + while (true) { + AstNodeStmt* const backp = VN_CAST(m_mgFirstp->backp(), NodeStmt); if (!backp || backp->nextp() != m_mgFirstp) break; // Don't move up the tree - if (m_checkMergeable(backp) != Mergeable::YES) break; + const StmtProperties& props = (*m_stmtPropertiesp)(backp); + if (props.m_isFence || props.writesConditionVar()) break; if (isSimplifiableNode(backp)) { ++m_listLenght; m_mgFirstp = backp; @@ -424,59 +779,53 @@ private: // Set up expected next node in list. m_mgNextp = nodep->nextp(); // If last under parent, done with current list - if (!m_mgNextp) mergeEnd(__LINE__); + if (!m_mgNextp) mergeEnd(); // We did add to the list return true; } // If this node is the next expected node and is helpful to add to the list, do so, // otherwise end the current merge. Return ture if added, false if ended merge. 
- bool addIfHelpfulElseEndMerge(AstNode* nodep) { + bool addIfHelpfulElseEndMerge(AstNodeStmt* nodep) { UASSERT_OBJ(m_mgFirstp, nodep, "List must be open"); if (m_mgNextp == nodep) { if (isSimplifiableNode(nodep)) { - if (addToList(nodep, nullptr, __LINE__)) return true; + if (addToList(nodep, nullptr)) return true; } else if (isCheapNode(nodep)) { nodep->user2(1); - if (addToList(nodep, nullptr, __LINE__)) return true; + if (addToList(nodep, nullptr)) return true; } } // Not added to list, so we are done with the current list - mergeEnd(__LINE__); + mergeEnd(); return false; } - bool checkOrMakeMergeable(AstNode* nodep) { - const Mergeable reason = m_checkMergeable(nodep); - // If meregeable, we are done - if (reason == Mergeable::YES) return true; - // Node not mergeable. - // If no current list, then this node is just special, move on. - if (!m_mgFirstp) return false; - // Otherwise finish current list - mergeEnd(__LINE__); - // If a tree was not mergeable due to an assignment to a condition, - // then finishing the current list makes it mergeable again. - return reason == Mergeable::NO_COND_ASSIGN; + bool checkOrMakeMergeable(const AstNodeStmt* nodep) { + const StmtProperties& props = (*m_stmtPropertiesp)(nodep); + if (props.m_isFence) return false; // Fence node never mergeable + // If the statement writes a condition variable of a pending merge, + // we must end the pending merge + if (m_mgFirstp && props.writesConditionVar()) mergeEnd(); + return true; // Now surely mergeable } - void mergeEndIfIncompatible(AstNode* nodep, AstNode* condp) { + void mergeEndIfIncompatible(const AstNode* nodep, const AstNode* condp) { if (m_mgFirstp && (m_mgNextp != nodep || !condp->sameTree(m_mgCondp))) { // Node in different list, or has different condition. Finish current list. 
- mergeEnd(__LINE__); + mergeEnd(); } } // VISITORS virtual void visit(AstNodeAssign* nodep) override { - AstNode* const rhsp = nodep->rhsp(); - if (const AstNodeCond* const condp = extractCond(rhsp)) { + if (AstNode* const condp = (*m_stmtPropertiesp)(nodep).m_condp) { // Check if mergeable if (!checkOrMakeMergeable(nodep)) return; // Close potentially incompatible pending merge - mergeEndIfIncompatible(nodep, condp->condp()); + mergeEndIfIncompatible(nodep, condp); // Add current node - addToList(nodep, condp->condp(), __LINE__); + addToList(nodep, condp); } else if (m_mgFirstp) { addIfHelpfulElseEndMerge(nodep); } @@ -493,21 +842,22 @@ private: // Close potentially incompatible pending merge mergeEndIfIncompatible(nodep, nodep->condp()); // Add current node - addToList(nodep, nodep->condp(), __LINE__); + addToList(nodep, nodep->condp()); + } + + virtual void visit(AstNodeStmt* nodep) override { + if (m_mgFirstp && addIfHelpfulElseEndMerge(nodep)) return; + iterateChildren(nodep); + } + + virtual void visit(AstCFunc* nodep) override { + // Merge function body + if (nodep->stmtsp()) process(nodep->stmtsp()); } // For speed, only iterate what is necessary. 
virtual void visit(AstNetlist* nodep) override { iterateAndNextNull(nodep->modulesp()); } virtual void visit(AstNodeModule* nodep) override { iterateAndNextNull(nodep->stmtsp()); } - virtual void visit(AstCFunc* nodep) override { - iterateChildren(nodep); - // Close list, if there is one at the end of the function - if (m_mgFirstp) mergeEnd(__LINE__); - } - virtual void visit(AstNodeStmt* nodep) override { - if (m_mgFirstp && addIfHelpfulElseEndMerge(nodep)) return; - iterateChildren(nodep); - } virtual void visit(AstNode* nodep) override {} public: @@ -520,6 +870,8 @@ public: } }; +} // namespace + //###################################################################### // MergeConditionals class functions diff --git a/src/V3OptionParser.cpp b/src/V3OptionParser.cpp index 4439ba53d..d98b4fd90 100644 --- a/src/V3OptionParser.cpp +++ b/src/V3OptionParser.cpp @@ -30,6 +30,7 @@ struct V3OptionParser::Impl { // Setting for isOnOffAllowed() and isPartialMatchAllowed() enum class en : uint8_t { NONE, // "-opt" + FONOFF, // "-fopt" and "-fno-opt" ONOFF, // "-opt" and "-no-opt" VALUE // "-opt val" }; @@ -39,6 +40,7 @@ struct V3OptionParser::Impl { bool m_undocumented = false; // This option is not documented public: virtual bool isValueNeeded() const override final { return MODE == en::VALUE; } + virtual bool isFOnOffAllowed() const override final { return MODE == en::FONOFF; } virtual bool isOnOffAllowed() const override final { return MODE == en::ONOFF; } virtual bool isPartialMatchAllowed() const override final { return ALLOW_PARTIAL_MATCH; } virtual bool isUndocumented() const override { return m_undocumented; } @@ -47,6 +49,7 @@ struct V3OptionParser::Impl { // Actual action classes template class ActionSet; // "-opt" for bool-ish, "-opt val" for int and string + template class ActionFOnOff; // "-fopt" and "-fno-opt" for bool-ish template class ActionOnOff; // "-opt" and "-no-opt" for bool-ish class ActionCbCall; // Callback without argument for "-opt" class 
ActionCbOnOff; // Callback for "-opt" and "-no-opt" @@ -80,6 +83,7 @@ V3OPTION_PARSER_DEF_ACT_CLASS(ActionSet, VOptionBool, m_valp->setTrueOrFalse(tru V3OPTION_PARSER_DEF_ACT_CLASS(ActionSet, int, *m_valp = std::atoi(argp), en::VALUE); V3OPTION_PARSER_DEF_ACT_CLASS(ActionSet, string, *m_valp = argp, en::VALUE); +V3OPTION_PARSER_DEF_ACT_CLASS(ActionFOnOff, bool, *m_valp = !hasPrefixFNo(optp), en::FONOFF); V3OPTION_PARSER_DEF_ACT_CLASS(ActionOnOff, bool, *m_valp = !hasPrefixNo(optp), en::ONOFF); #ifndef V3OPTION_PARSER_NO_VOPTION_BOOL V3OPTION_PARSER_DEF_ACT_CLASS(ActionOnOff, VOptionBool, m_valp->setTrueOrFalse(!hasPrefixNo(optp)), @@ -117,12 +121,23 @@ V3OPTION_PARSER_DEF_ACT_CB_CLASS(ActionCbPartialMatchVal, void(const char*, cons V3OptionParser::ActionIfs* V3OptionParser::find(const char* optp) { const auto it = m_pimpl->m_options.find(optp); - if (it != m_pimpl->m_options.end()) return it->second.get(); + if (it != m_pimpl->m_options.end()) return it->second.get(); // Exact match for (auto&& act : m_pimpl->m_options) { + if (act.second->isFOnOffAllowed()) { // Find starts with "-fno" + if (const char* const nop + = VString::startsWith(optp, "-fno-") ? (optp + strlen("-fno-")) : nullptr) { + if (act.first.substr(strlen("-f"), std::string::npos) + == nop) { // [-f]opt = [-fno-]opt + return act.second.get(); + } + } + } if (act.second->isOnOffAllowed()) { // Find starts with "-no" - const char* const nop = VString::startsWith(optp, "-no") ? (optp + 3) : nullptr; - if (nop && (act.first == nop || act.first == (string{"-"} + nop))) { - return act.second.get(); + if (const char* const nop + = VString::startsWith(optp, "-no") ? 
(optp + strlen("-no")) : nullptr) { + if (act.first == nop || act.first == (string{"-"} + nop)) { + return act.second.get(); + } } } else if (act.second->isPartialMatchAllowed()) { if (VString::startsWith(optp, act.first)) return act.second.get(); @@ -143,6 +158,12 @@ V3OptionParser::ActionIfs& V3OptionParser::add(const std::string& opt, ARG arg) return *insertedResult.first->second; } +bool V3OptionParser::hasPrefixFNo(const char* strp) { + UASSERT(strp[0] == '-', strp << " does not start with '-'"); + if (strp[1] == '-') ++strp; + return VString::startsWith(strp, "-fno"); +} + bool V3OptionParser::hasPrefixNo(const char* strp) { UASSERT(strp[0] == '-', strp << " does not start with '-'"); if (strp[1] == '-') ++strp; @@ -178,6 +199,10 @@ void V3OptionParser::finalize() { for (auto&& opt : m_pimpl->m_options) { if (opt.second->isUndocumented()) continue; m_pimpl->m_spellCheck.pushCandidate(opt.first); + if (opt.second->isFOnOffAllowed()) { + m_pimpl->m_spellCheck.pushCandidate( + "-fno-" + opt.first.substr(strlen("-f"), std::string::npos)); + } if (opt.second->isOnOffAllowed()) m_pimpl->m_spellCheck.pushCandidate("-no" + opt.first); } m_pimpl->m_isFinalized = true; @@ -202,6 +227,7 @@ V3OPTION_PARSER_DEF_OP(Set, VOptionBool*, ActionSet) #endif V3OPTION_PARSER_DEF_OP(Set, int*, ActionSet) V3OPTION_PARSER_DEF_OP(Set, string*, ActionSet) +V3OPTION_PARSER_DEF_OP(FOnOff, bool*, ActionFOnOff) V3OPTION_PARSER_DEF_OP(OnOff, bool*, ActionOnOff) #ifndef V3OPTION_PARSER_NO_VOPTION_BOOL V3OPTION_PARSER_DEF_OP(OnOff, VOptionBool*, ActionOnOff) diff --git a/src/V3OptionParser.h b/src/V3OptionParser.h index fc199264f..e77f43a26 100644 --- a/src/V3OptionParser.h +++ b/src/V3OptionParser.h @@ -66,6 +66,7 @@ private: // METHODS ActionIfs* find(const char* optp); template ActionIfs& add(const string& opt, ARG arg); + static bool hasPrefixFNo(const char* strp); // Returns true if strp starts with "-fno" static bool hasPrefixNo(const char* strp); // Returns true if strp starts with 
"-no" public: @@ -87,6 +88,7 @@ class V3OptionParser::ActionIfs VL_NOT_FINAL { public: virtual ~ActionIfs() = default; virtual bool isValueNeeded() const = 0; // Need val of "-opt val" + virtual bool isFOnOffAllowed() const = 0; // true if "-fno-opt" is allowd virtual bool isOnOffAllowed() const = 0; // true if "-no-opt" is allowd virtual bool isPartialMatchAllowed() const = 0; // true if "-Wno-" matches "-Wno-fatal" virtual bool isUndocumented() const = 0; // Will not be suggested in typo @@ -101,13 +103,15 @@ class V3OptionParser::AppendHelper final { public: // TYPES // Tag to specify which operator() to call - struct Set {}; // For ActionSet + struct FOnOff {}; // For ActionFOnOff struct OnOff {}; // For ActionOnOff + struct Set {}; // For ActionSet + struct CbCall {}; // For ActionCbCall - struct CbOnOff {}; // For ActionOnOff - struct CbVal {}; // For ActionCbVal + struct CbOnOff {}; // For ActionOnOff of ActionFOnOff struct CbPartialMatch {}; // For ActionCbPartialMatch struct CbPartialMatchVal {}; // For ActionCbPartialMatchVal + struct CbVal {}; // For ActionCbVal private: // MEMBERS @@ -122,6 +126,7 @@ public: ActionIfs& operator()(const char* optp, Set, int*) const; ActionIfs& operator()(const char* optp, Set, string*) const; + ActionIfs& operator()(const char* optp, FOnOff, bool*) const; ActionIfs& operator()(const char* optp, OnOff, bool*) const; #ifndef V3OPTION_PARSER_NO_VOPTION_BOOL ActionIfs& operator()(const char* optp, OnOff, VOptionBool*) const; @@ -144,13 +149,14 @@ public: #define V3OPTION_PARSER_DECL_TAGS \ const auto Set VL_ATTR_UNUSED = V3OptionParser::AppendHelper::Set{}; \ + const auto FOnOff VL_ATTR_UNUSED = V3OptionParser::AppendHelper::FOnOff{}; \ const auto OnOff VL_ATTR_UNUSED = V3OptionParser::AppendHelper::OnOff{}; \ const auto CbCall VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbCall{}; \ const auto CbOnOff VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbOnOff{}; \ - const auto CbVal VL_ATTR_UNUSED = 
V3OptionParser::AppendHelper::CbVal{}; \ const auto CbPartialMatch VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbPartialMatch{}; \ const auto CbPartialMatchVal VL_ATTR_UNUSED \ - = V3OptionParser::AppendHelper::CbPartialMatchVal {} + = V3OptionParser::AppendHelper::CbPartialMatchVal{}; \ + const auto CbVal VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbVal{}; //###################################################################### diff --git a/src/V3Options.cpp b/src/V3Options.cpp index 88e1b4d31..b3abe29f4 100644 --- a/src/V3Options.cpp +++ b/src/V3Options.cpp @@ -775,8 +775,16 @@ void V3Options::notify() { && !v3Global.opt.xmlOnly()); } - // --trace-threads implies --threads 1 unless explicitly specified - if (traceThreads() && !threads()) m_threads = 1; + if (trace()) { + // With --trace-fst, --trace-threads implies --threads 1 unless explicitly specified + if (traceFormat().fst() && traceThreads() && !threads()) m_threads = 1; + + // With --trace, --trace-threads is ignored + if (traceFormat().vcd()) m_traceThreads = threads() ? 
1 : 0; + } + + UASSERT(!(useTraceParallel() && useTraceOffload()), + "Cannot use both parallel and offloaded tracing"); // Default split limits if not specified if (m_outputSplitCFuncs < 0) m_outputSplitCFuncs = m_outputSplit; @@ -1075,6 +1083,28 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char }); DECL_OPTION("-flatten", OnOff, &m_flatten); + DECL_OPTION("-facyc-simp", FOnOff, &m_fAcycSimp); + DECL_OPTION("-fassemble", FOnOff, &m_fAssemble); + DECL_OPTION("-fcase", FOnOff, &m_fCase); + DECL_OPTION("-fcombine", FOnOff, &m_fCombine); + DECL_OPTION("-fconst", FOnOff, &m_fConst); + DECL_OPTION("-fconst-bit-op-tree", FOnOff, &m_fConstBitOpTree); + DECL_OPTION("-fdedup", FOnOff, &m_fDedupe); + DECL_OPTION("-fexpand", FOnOff, &m_fExpand); + DECL_OPTION("-fgate", FOnOff, &m_fGate); + DECL_OPTION("-finline", FOnOff, &m_fInline); + DECL_OPTION("-flife", FOnOff, &m_fLife); + DECL_OPTION("-flife-post", FOnOff, &m_fLifePost); + DECL_OPTION("-flocalize", FOnOff, &m_fLocalize); + DECL_OPTION("-fmerge-cond", FOnOff, &m_fMergeCond); + DECL_OPTION("-fmerge-const-pool", FOnOff, &m_fMergeConstPool); + DECL_OPTION("-freloop", FOnOff, &m_fReloop); + DECL_OPTION("-freorder", FOnOff, &m_fReorder); + DECL_OPTION("-fsplit", FOnOff, &m_fSplit); + DECL_OPTION("-fsubst", FOnOff, &m_fSubst); + DECL_OPTION("-fsubst-const", FOnOff, &m_fSubstConst); + DECL_OPTION("-ftable", FOnOff, &m_fTable); + DECL_OPTION("-G", CbPartialMatch, [this](const char* optp) { addParameter(optp, false); }); DECL_OPTION("-gate-stmts", Set, &m_gateStmts); DECL_OPTION("-gdb", CbCall, []() {}); // Processed only in bin/verilator shell @@ -1144,50 +1174,51 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char } }); DECL_OPTION("-max-num-width", Set, &m_maxNumWidth); - DECL_OPTION("-merge-const-pool", OnOff, &m_mergeConstPool); DECL_OPTION("-mod-prefix", Set, &m_modPrefix); - DECL_OPTION("-O", CbPartialMatch, [this](const char* optp) { - // Optimization + 
DECL_OPTION("-O0", CbCall, [this]() { optimize(0); }); + DECL_OPTION("-O1", CbCall, [this]() { optimize(1); }); + DECL_OPTION("-O2", CbCall, [this]() { optimize(2); }); + DECL_OPTION("-O3", CbCall, [this]() { optimize(3); }); + + DECL_OPTION("-O", CbPartialMatch, [this, fl](const char* optp) { + // Optimization, e.g. -O1rX + // LCOV_EXCL_START + fl->v3warn(DEPRECATED, "Option -O is deprecated. " + "Use -f or -fno- instead."); for (const char* cp = optp; *cp; ++cp) { const bool flag = isupper(*cp); switch (tolower(*cp)) { - case '0': optimize(0); break; // 0=all off - case '1': optimize(1); break; // 1=all on - case '2': optimize(2); break; // 2=not used - case '3': optimize(3); break; // 3=high - case 'a': m_oTable = flag; break; - case 'b': m_oCombine = flag; break; - case 'c': m_oConst = flag; break; - case 'd': m_oDedupe = flag; break; - case 'e': m_oCase = flag; break; - // f - case 'g': m_oGate = flag; break; - // h - case 'i': m_oInline = flag; break; - // j - case 'k': m_oSubstConst = flag; break; - case 'l': m_oLife = flag; break; - case 'm': m_oAssemble = flag; break; - // n - case 'o': - m_oConstBitOpTree = flag; - break; // Can remove ~2022-01 when stable - // o will be used as an escape for a second character of optimization disables + case '0': optimize(0); break; + case '1': optimize(1); break; + case '2': optimize(2); break; + case '3': optimize(3); break; + case 'a': m_fTable = flag; break; // == -fno-table + case 'b': m_fCombine = flag; break; // == -fno-combine + case 'c': m_fConst = flag; break; // == -fno-const + case 'd': m_fDedupe = flag; break; // == -fno-dedup + case 'e': m_fCase = flag; break; // == -fno-case + case 'g': m_fGate = flag; break; // == -fno-gate + case 'i': m_fInline = flag; break; // == -fno-inline + case 'k': m_fSubstConst = flag; break; // == -fno-subst-const + case 'l': m_fLife = flag; break; // == -fno-life + case 'm': m_fAssemble = flag; break; // == -fno-assemble + case 'o': m_fConstBitOpTree = flag; break; // == 
-fno-const-bit-op-tree case 'p': m_public = !flag; break; // With -Op so flag=0, we want public on so few optimizations done - // q - case 'r': m_oReorder = flag; break; - case 's': m_oSplit = flag; break; - case 't': m_oLifePost = flag; break; - case 'u': m_oSubst = flag; break; - case 'v': m_oReloop = flag; break; - case 'w': m_oMergeCond = flag; break; - case 'x': m_oExpand = flag; break; - case 'y': m_oAcycSimp = flag; break; - case 'z': m_oLocalize = flag; break; - default: break; // No error, just ignore + case 'r': m_fReorder = flag; break; // == -fno-reorder + case 's': m_fSplit = flag; break; // == -fno-split + case 't': m_fLifePost = flag; break; // == -fno-life-post + case 'u': m_fSubst = flag; break; // == -fno-subst + case 'v': m_fReloop = flag; break; // == -fno-reloop + case 'w': m_fMergeCond = flag; break; // == -fno-merge-cond + case 'x': m_fExpand = flag; break; // == -fno-expand + case 'y': m_fAcycSimp = flag; break; // == -fno-acyc-simp + case 'z': m_fLocalize = flag; break; // == -fno-localize + default: + break; // No error, just ignore + // LCOV_EXCL_STOP } } }); @@ -1352,7 +1383,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char DECL_OPTION("-trace-threads", CbVal, [this, fl](const char* valp) { m_trace = true; m_traceThreads = std::atoi(valp); - if (m_traceThreads < 0) fl->v3fatal("--trace-threads must be >= 0: " << valp); + if (m_traceThreads < 1) fl->v3fatal("--trace-threads must be >= 1: " << valp); }); DECL_OPTION("-trace-underscore", OnOff, &m_traceUnderscore); @@ -1781,26 +1812,26 @@ int V3Options::dumpTreeLevel(const string& srcfile_path) { void V3Options::optimize(int level) { // Set all optimizations to on/off const bool flag = level > 0; - m_oAcycSimp = flag; - m_oAssemble = flag; - m_oCase = flag; - m_oCombine = flag; - m_oConst = flag; - m_oConstBitOpTree = flag; - m_oDedupe = flag; - m_oExpand = flag; - m_oGate = flag; - m_oInline = flag; - m_oLife = flag; - m_oLifePost = flag; - m_oLocalize = 
flag; - m_oMergeCond = flag; - m_oReloop = flag; - m_oReorder = flag; - m_oSplit = flag; - m_oSubst = flag; - m_oSubstConst = flag; - m_oTable = flag; + m_fAcycSimp = flag; + m_fAssemble = flag; + m_fCase = flag; + m_fCombine = flag; + m_fConst = flag; + m_fConstBitOpTree = flag; + m_fDedupe = flag; + m_fExpand = flag; + m_fGate = flag; + m_fInline = flag; + m_fLife = flag; + m_fLifePost = flag; + m_fLocalize = flag; + m_fMergeCond = flag; + m_fReloop = flag; + m_fReorder = flag; + m_fSplit = flag; + m_fSubst = flag; + m_fSubstConst = flag; + m_fTable = flag; // And set specific optimization levels if (level >= 3) { m_inlineMult = -1; // Maximum inlining diff --git a/src/V3Options.h b/src/V3Options.h index dd71a1b3b..f5ed6df29 100644 --- a/src/V3Options.h +++ b/src/V3Options.h @@ -246,7 +246,6 @@ private: bool m_lintOnly = false; // main switch: --lint-only bool m_gmake = false; // main switch: --make gmake bool m_main = false; // main swithc: --main - bool m_mergeConstPool = true; // main switch: --merge-const-pool bool m_outFormatOk = false; // main switch: --cc, --sc or --sp was specified bool m_pedantic = false; // main switch: --Wpedantic bool m_pinsScUint = false; // main switch: --pins-sc-uint @@ -340,27 +339,27 @@ private: V3LangCode m_defaultLanguage; // main switch: --language // MEMBERS (optimizations) - // // main switch: -Op: --public - bool m_oAcycSimp; // main switch: -Oy: acyclic pre-optimizations - bool m_oAssemble; // main switch: -Om: assign assemble - bool m_oCase; // main switch: -Oe: case tree conversion - bool m_oCombine; // main switch: -Ob: common icode packing - bool m_oConst; // main switch: -Oc: constant folding - bool m_oConstBitOpTree; // main switch: -Oo: constant bit op tree - bool m_oDedupe; // main switch: -Od: logic deduplication - bool m_oExpand; // main switch: -Ox: expansion of C macros - bool m_oGate; // main switch: -Og: gate wire elimination - bool m_oInline; // main switch: -Oi: module inlining - bool m_oLife; // main 
switch: -Ol: variable lifetime - bool m_oLifePost; // main switch: -Ot: delayed assignment elimination - bool m_oLocalize; // main switch: -Oz: convert temps to local variables - bool m_oMergeCond; // main switch: -Ob: merge conditionals - bool m_oReloop; // main switch: -Ov: reform loops - bool m_oReorder; // main switch: -Or: reorder assignments in blocks - bool m_oSplit; // main switch: -Os: always assignment splitting - bool m_oSubst; // main switch: -Ou: substitute expression temp values - bool m_oSubstConst; // main switch: -Ok: final constant substitution - bool m_oTable; // main switch: -Oa: lookup table creation + bool m_fAcycSimp; // main switch: -fno-acyc-simp: acyclic pre-optimizations + bool m_fAssemble; // main switch: -fno-assemble: assign assemble + bool m_fCase; // main switch: -fno-case: case tree conversion + bool m_fCombine; // main switch: -fno-combine: common icode packing + bool m_fConst; // main switch: -fno-const: constant folding + bool m_fConstBitOpTree; // main switch: -fno-const-bit-op-tree constant bit op tree + bool m_fDedupe; // main switch: -fno-dedupe: logic deduplication + bool m_fExpand; // main switch: -fno-expand: expansion of C macros + bool m_fGate; // main switch: -fno-gate: gate wire elimination + bool m_fInline; // main switch: -fno-inline: module inlining + bool m_fLife; // main switch: -fno-life: variable lifetime + bool m_fLifePost; // main switch: -fno-life-post: delayed assignment elimination + bool m_fLocalize; // main switch: -fno-localize: convert temps to local variables + bool m_fMergeCond; // main switch: -fno-merge-cond: merge conditionals + bool m_fMergeConstPool = true; // main switch: --fmerge-const-pool + bool m_fReloop; // main switch: -fno-reloop: reform loops + bool m_fReorder; // main switch: -fno-reorder: reorder assignments in blocks + bool m_fSplit; // main switch: -fno-split: always assignment splitting + bool m_fSubst; // main switch: -fno-subst: substitute expression temp values + bool 
m_fSubstConst; // main switch: -fno-subst-const: final constant substitution + bool m_fTable; // main switch: -fno-table: lookup table creation // clang-format on bool m_available = false; // Set to true at the end of option parsing @@ -458,7 +457,6 @@ public: bool traceStructs() const { return m_traceStructs; } bool traceUnderscore() const { return m_traceUnderscore; } bool main() const { return m_main; } - bool mergeConstPool() const { return m_mergeConstPool; } bool outFormatOk() const { return m_outFormatOk; } bool keepTempFiles() const { return (V3Error::debugDefault() != 0); } bool pedantic() const { return m_pedantic; } @@ -516,8 +514,10 @@ public: int traceMaxArray() const { return m_traceMaxArray; } int traceMaxWidth() const { return m_traceMaxWidth; } int traceThreads() const { return m_traceThreads; } - bool useTraceOffloadThread() const { - return traceThreads() == 0 ? 0 : traceThreads() - traceFormat().fst(); + bool useTraceOffload() const { return trace() && traceFormat().fst() && traceThreads() > 1; } + bool useTraceParallel() const { return trace() && traceFormat().vcd() && threads() > 1; } + unsigned vmTraceThreads() const { + return useTraceParallel() ? threads() : useTraceOffload() ? 
1 : 0; } int unrollCount() const { return m_unrollCount; } int unrollStmts() const { return m_unrollStmts; } @@ -571,26 +571,27 @@ public: bool isNoClocker(const string& signame) const; // ACCESSORS (optimization options) - bool oAcycSimp() const { return m_oAcycSimp; } - bool oAssemble() const { return m_oAssemble; } - bool oCase() const { return m_oCase; } - bool oCombine() const { return m_oCombine; } - bool oConst() const { return m_oConst; } - bool oConstBitOpTree() const { return m_oConstBitOpTree; } - bool oDedupe() const { return m_oDedupe; } - bool oExpand() const { return m_oExpand; } - bool oGate() const { return m_oGate; } - bool oInline() const { return m_oInline; } - bool oLife() const { return m_oLife; } - bool oLifePost() const { return m_oLifePost; } - bool oLocalize() const { return m_oLocalize; } - bool oMergeCond() const { return m_oMergeCond; } - bool oReloop() const { return m_oReloop; } - bool oReorder() const { return m_oReorder; } - bool oSplit() const { return m_oSplit; } - bool oSubst() const { return m_oSubst; } - bool oSubstConst() const { return m_oSubstConst; } - bool oTable() const { return m_oTable; } + bool fAcycSimp() const { return m_fAcycSimp; } + bool fAssemble() const { return m_fAssemble; } + bool fCase() const { return m_fCase; } + bool fCombine() const { return m_fCombine; } + bool fConst() const { return m_fConst; } + bool fConstBitOpTree() const { return m_fConstBitOpTree; } + bool fDedupe() const { return m_fDedupe; } + bool fExpand() const { return m_fExpand; } + bool fGate() const { return m_fGate; } + bool fInline() const { return m_fInline; } + bool fLife() const { return m_fLife; } + bool fLifePost() const { return m_fLifePost; } + bool fLocalize() const { return m_fLocalize; } + bool fMergeCond() const { return m_fMergeCond; } + bool fMergeConstPool() const { return m_fMergeConstPool; } + bool fReloop() const { return m_fReloop; } + bool fReorder() const { return m_fReorder; } + bool fSplit() const { return 
m_fSplit; } + bool fSubst() const { return m_fSubst; } + bool fSubstConst() const { return m_fSubstConst; } + bool fTable() const { return m_fTable; } string traceClassBase() const { return m_traceFormat.classBase(); } string traceClassLang() const { return m_traceFormat.classBase() + (systemC() ? "Sc" : "C"); } diff --git a/src/V3Premit.cpp b/src/V3Premit.cpp index 7501cd456..836b7c814 100644 --- a/src/V3Premit.cpp +++ b/src/V3Premit.cpp @@ -133,7 +133,7 @@ private: && !constp->num().isString(); // Not a string if (useConstPool) { // Extract into constant pool. - const bool merge = v3Global.opt.mergeConstPool(); + const bool merge = v3Global.opt.fMergeConstPool(); varp = v3Global.rootp()->constPoolp()->findConst(constp, merge)->varp(); nodep->deleteTree(); ++m_extractedToConstPool; diff --git a/src/V3Trace.cpp b/src/V3Trace.cpp index 61d009b6f..9fa1b099a 100644 --- a/src/V3Trace.cpp +++ b/src/V3Trace.cpp @@ -180,6 +180,10 @@ private: TraceActivityVertex* const m_alwaysVtxp; // "Always trace" vertex bool m_finding = false; // Pass one of algorithm? + // Trace parallelism. Only VCD tracing can be parallelized at this time. + const uint32_t m_parallelism + = v3Global.opt.useTraceParallel() ? static_cast(v3Global.opt.threads()) : 1; + VDouble0 m_statUniqSigs; // Statistic tracking VDouble0 m_statUniqCodes; // Statistic tracking @@ -388,7 +392,7 @@ private: if (!it->second->duplicatep()) { uint32_t cost = 0; const AstTraceDecl* const declp = it->second->nodep(); - // The number of comparisons required by tracep->chg* + // The number of comparisons required by bufp->chg* cost += declp->isWide() ? declp->codeInc() : 1; // Arrays are traced by element cost *= declp->arrayRange().ranged() ? 
declp->arrayRange().elements() : 1; @@ -494,7 +498,7 @@ private: }; if (isTopFunc) { // Top functions - funcp->argTypes("void* voidSelf, " + v3Global.opt.traceClassBase() + "* tracep"); + funcp->argTypes("void* voidSelf, " + v3Global.opt.traceClassBase() + "::Buffer* bufp"); addInitStr(voidSelfAssign(m_topModp)); addInitStr(symClassAssign()); // Add global activity check to change dump functions @@ -508,32 +512,33 @@ private: m_regFuncp->addStmtsp(new AstText(flp, "tracep->addChgCb(", true)); } m_regFuncp->addStmtsp(new AstAddrOfCFunc(flp, funcp)); - m_regFuncp->addStmtsp(new AstText(flp, ", vlSelf);\n", true)); + const string threadPool{m_parallelism > 1 ? "vlSymsp->__Vm_threadPoolp" : "nullptr"}; + m_regFuncp->addStmtsp(new AstText(flp, ", vlSelf, " + threadPool + ");\n", true)); } else { // Sub functions - funcp->argTypes(v3Global.opt.traceClassBase() + "* tracep"); + funcp->argTypes(v3Global.opt.traceClassBase() + "::Buffer* bufp"); // Setup base references. Note in rare occasions we can end up with an empty trace // sub function, hence the VL_ATTR_UNUSED attributes. 
if (full) { // Full dump sub function addInitStr("uint32_t* const oldp VL_ATTR_UNUSED = " - "tracep->oldp(vlSymsp->__Vm_baseCode);\n"); + "bufp->oldp(vlSymsp->__Vm_baseCode);\n"); } else { // Change dump sub function - if (v3Global.opt.useTraceOffloadThread()) { + if (v3Global.opt.useTraceOffload()) { addInitStr("const uint32_t base VL_ATTR_UNUSED = " "vlSymsp->__Vm_baseCode + " + cvtToStr(baseCode) + ";\n"); - addInitStr("if (false && tracep) {} // Prevent unused\n"); + addInitStr("if (false && bufp) {} // Prevent unused\n"); } else { addInitStr("uint32_t* const oldp VL_ATTR_UNUSED = " - "tracep->oldp(vlSymsp->__Vm_baseCode + " + "bufp->oldp(vlSymsp->__Vm_baseCode + " + cvtToStr(baseCode) + ");\n"); } } // Add call to top function AstCCall* const callp = new AstCCall(funcp->fileline(), funcp); - callp->argTypes("tracep"); + callp->argTypes("bufp"); topFuncp->addStmtsp(callp); } // Done @@ -728,7 +733,7 @@ private: // We will split functions such that each have to dump roughly the same amount of data // for this we need to keep tack of the number of codes used by the trace functions. 
uint32_t nFullCodes = 0; // Number of non-duplicate codes (need to go into full* dump) - uint32_t nChgCodes = 0; // Number of non-consant codes (need to go in to chg* dump) + uint32_t nChgCodes = 0; // Number of non-constant codes (need to go in to chg* dump) sortTraces(traces, nFullCodes, nChgCodes); UINFO(5, "nFullCodes: " << nFullCodes << " nChgCodes: " << nChgCodes << endl); @@ -747,13 +752,11 @@ private: m_regFuncp->isLoose(true); m_topScopep->addActivep(m_regFuncp); - const int parallelism = 1; // Note: will bump this later, code below works for any value - // Create the full dump functions, also allocates signal numbers - createFullTraceFunction(traces, nFullCodes, parallelism); + createFullTraceFunction(traces, nFullCodes, m_parallelism); // Create the incremental dump functions - createChgTraceFunctions(traces, nChgCodes, parallelism); + createChgTraceFunctions(traces, nChgCodes, m_parallelism); // Remove refs to traced values from TraceDecl nodes, these have now moved under // TraceInc diff --git a/src/V3Width.cpp b/src/V3Width.cpp index 6a76170a1..c35b4270c 100644 --- a/src/V3Width.cpp +++ b/src/V3Width.cpp @@ -504,6 +504,7 @@ private: // width: LHS + RHS AstNodeDType* const vdtypep = m_vup->dtypeNullSkipRefp(); userIterate(vdtypep, WidthVP(SELF, BOTH).p()); + // Conversions if (VN_IS(vdtypep, QueueDType)) { // Queue "element 0" is lhsp, so we need to swap arguments auto* const newp = new AstConsQueue(nodep->fileline(), nodep->rhsp()->unlinkFrBack(), @@ -521,6 +522,16 @@ private: userIterateChildren(newp, m_vup); return; } + if (VN_IS(vdtypep, UnpackArrayDType)) { + auto* const newp = new AstPattern{nodep->fileline(), nullptr}; + patConcatConvertRecurse(newp, nodep); + nodep->replaceWith(newp); + VL_DO_DANGLING(pushDeletep(nodep), nodep); + userIterate(newp, m_vup); + return; + } + + // Concat handling if (m_vup->prelim()) { if (VN_IS(vdtypep, AssocArrayDType) // || VN_IS(vdtypep, DynArrayDType) // @@ -662,7 +673,8 @@ private: } AstNodeDType* const 
vdtypep = m_vup->dtypeNullSkipRefp(); - if (VN_IS(vdtypep, QueueDType) || VN_IS(vdtypep, DynArrayDType)) { + if (VN_IS(vdtypep, QueueDType) || VN_IS(vdtypep, DynArrayDType) + || VN_IS(vdtypep, UnpackArrayDType)) { if (times != 1) nodep->v3warn(E_UNSUPPORTED, "Unsupported: Non-1 replication to form " << vdtypep->prettyDTypeNameQ() @@ -674,7 +686,7 @@ private: VL_DO_DANGLING(pushDeletep(nodep), nodep); return; } - if (VN_IS(vdtypep, AssocArrayDType) || VN_IS(vdtypep, UnpackArrayDType)) { + if (VN_IS(vdtypep, AssocArrayDType)) { nodep->v3warn(E_UNSUPPORTED, "Unsupported: Replication to form " << vdtypep->prettyDTypeNameQ() << " data type"); } @@ -6236,6 +6248,21 @@ private: return patmap; } + void patConcatConvertRecurse(AstPattern* patternp, AstConcat* nodep) { + if (AstConcat* lhsp = VN_CAST(nodep->lhsp(), Concat)) { + patConcatConvertRecurse(patternp, lhsp); + } else { + patternp->addItemsp(new AstPatMember{nodep->lhsp()->fileline(), + nodep->lhsp()->unlinkFrBack(), nullptr, nullptr}); + } + if (AstConcat* rhsp = VN_CAST(nodep->rhsp(), Concat)) { + patConcatConvertRecurse(patternp, rhsp); + } else { + patternp->addItemsp(new AstPatMember{nodep->rhsp()->fileline(), + nodep->rhsp()->unlinkFrBack(), nullptr, nullptr}); + } + } + void makeOpenArrayShell(AstNodeFTaskRef* nodep) { UINFO(4, "Replicate openarray function " << nodep->taskp() << endl); AstNodeFTask* const oldTaskp = nodep->taskp(); diff --git a/src/Verilator.cpp b/src/Verilator.cpp index cabbb37b2..2d37511f1 100644 --- a/src/Verilator.cpp +++ b/src/Verilator.cpp @@ -237,7 +237,7 @@ static void process() { // Module inlining // Cannot remove dead variables after this, as alias information for final // V3Scope's V3LinkDot is in the AstVar. 
- if (v3Global.opt.oInline()) { + if (v3Global.opt.fInline()) { V3Inline::inlineAll(v3Global.rootp()); V3LinkDot::linkDotArrayed(v3Global.rootp()); // Cleanup as made new modules } @@ -308,11 +308,11 @@ static void process() { // Push constants across variables and remove redundant assignments V3Const::constifyAll(v3Global.rootp()); - if (v3Global.opt.oLife()) V3Life::lifeAll(v3Global.rootp()); + if (v3Global.opt.fLife()) V3Life::lifeAll(v3Global.rootp()); // Make large low-fanin logic blocks into lookup tables // This should probably be done much later, once we have common logic elimination. - if (!v3Global.opt.lintOnly() && v3Global.opt.oTable()) { + if (!v3Global.opt.lintOnly() && v3Global.opt.fTable()) { V3Table::tableAll(v3Global.rootp()); } @@ -326,7 +326,7 @@ static void process() { V3Active::activeAll(v3Global.rootp()); // Split single ALWAYS blocks into multiple blocks for better ordering chances - if (v3Global.opt.oSplit()) V3Split::splitAlwaysAll(v3Global.rootp()); + if (v3Global.opt.fSplit()) V3Split::splitAlwaysAll(v3Global.rootp()); V3SplitAs::splitAsAll(v3Global.rootp()); // Create tracing sample points, before we start eliminating signals @@ -338,11 +338,11 @@ static void process() { // Gate-based logic elimination; eliminate signals and push constant across cell boundaries // Instant propagation makes lots-o-constant reduction possibilities. - if (v3Global.opt.oGate()) { + if (v3Global.opt.fGate()) { V3Gate::gateAll(v3Global.rootp()); // V3Gate calls constant propagation itself. } else { - v3info("Command Line disabled gate optimization with -Og/-O0. " + v3info("Command Line disabled gate optimization with -fno-gate. 
" "This may cause ordering problems."); } @@ -361,7 +361,7 @@ static void process() { } // Reorder assignments in pipelined blocks - if (v3Global.opt.oReorder()) V3Split::splitReorderAll(v3Global.rootp()); + if (v3Global.opt.fReorder()) V3Split::splitReorderAll(v3Global.rootp()); // Create delayed assignments // This creates lots of duplicate ACTIVES so ActiveTop needs to be after this step @@ -383,12 +383,12 @@ static void process() { // Cleanup any dly vars or other temps that are simple assignments // Life must be done before Subst, as it assumes each CFunc under // _eval is called only once. - if (v3Global.opt.oLife()) { + if (v3Global.opt.fLife()) { V3Const::constifyAll(v3Global.rootp()); V3Life::lifeAll(v3Global.rootp()); } - if (v3Global.opt.oLifePost()) V3LifePost::lifepostAll(v3Global.rootp()); + if (v3Global.opt.fLifePost()) V3LifePost::lifepostAll(v3Global.rootp()); // Remove unused vars V3Const::constifyAll(v3Global.rootp()); @@ -415,13 +415,13 @@ static void process() { v3Global.assertScoped(false); // Move variables from modules to function local variables where possible - if (v3Global.opt.oLocalize()) V3Localize::localizeAll(v3Global.rootp()); + if (v3Global.opt.fLocalize()) V3Localize::localizeAll(v3Global.rootp()); // Remove remaining scopes; make varrefs/funccalls relative to current module V3Descope::descopeAll(v3Global.rootp()); // Icache packing; combine common code in each module's functions into subroutines - if (v3Global.opt.oCombine()) V3Combine::combineAll(v3Global.rootp()); + if (v3Global.opt.fCombine()) V3Combine::combineAll(v3Global.rootp()); } V3Error::abortIfErrors(); @@ -445,30 +445,30 @@ static void process() { } // Expand macros and wide operators into C++ primitives - if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly() && v3Global.opt.oExpand()) { + if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly() && v3Global.opt.fExpand()) { V3Expand::expandAll(v3Global.rootp()); } // Propagate constants across WORDSEL arrayed 
temporaries - if (!v3Global.opt.xmlOnly() && v3Global.opt.oSubst()) { + if (!v3Global.opt.xmlOnly() && v3Global.opt.fSubst()) { // Constant folding of expanded stuff V3Const::constifyCpp(v3Global.rootp()); V3Subst::substituteAll(v3Global.rootp()); } - if (!v3Global.opt.xmlOnly() && v3Global.opt.oSubstConst()) { + if (!v3Global.opt.xmlOnly() && v3Global.opt.fSubstConst()) { // Constant folding of substitutions V3Const::constifyCpp(v3Global.rootp()); V3Dead::deadifyAll(v3Global.rootp()); } if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly()) { - if (v3Global.opt.oMergeCond()) { + if (v3Global.opt.fMergeCond()) { // Merge conditionals V3MergeCond::mergeAll(v3Global.rootp()); } - if (v3Global.opt.oReloop()) { + if (v3Global.opt.fReloop()) { // Reform loops to reduce code size // Must be after all Sel/array index based optimizations V3Reloop::reloopAll(v3Global.rootp()); diff --git a/test_regress/driver.pl b/test_regress/driver.pl index ffcfac4a8..541fb296f 100755 --- a/test_regress/driver.pl +++ b/test_regress/driver.pl @@ -77,7 +77,6 @@ my $opt_gdbbt; my $opt_gdbsim; my $opt_hashset; my $opt_jobs = 1; -my $opt_optimize; my $opt_quiet; my $opt_rerun; my $opt_rrsim; @@ -104,7 +103,6 @@ if (! GetOptions( "hashset=s" => \$opt_hashset, "help" => \&usage, "j=i" => \$opt_jobs, - "optimize:s" => \$opt_optimize, "quiet!" => \$opt_quiet, "rerun!" => \$opt_rerun, "rr!" 
=> \$opt_rr, @@ -661,7 +659,7 @@ sub new { verilator_define => 'VERILATOR', verilator_flags => ["-cc", "-Mdir $self->{obj_dir}", - "-OD", # As currently disabled unless -O3 + "--fdedup", # As currently disabled unless -O3 "--debug-check", "--comp-limit-members 10", ], verilator_flags2 => [], @@ -924,7 +922,6 @@ sub compile_vlt_flags { unshift @verilator_flags, "--trace" if $opt_trace; my $threads = ::calc_threads($Vltmt_threads); unshift @verilator_flags, "--threads $threads" if $param{vltmt} && $checkflags !~ /-threads /; - unshift @verilator_flags, "--trace-threads 1" if $param{vltmt} && $checkflags =~ /-trace /; unshift @verilator_flags, "--trace-threads 2" if $param{vltmt} && $checkflags =~ /-trace-fst /; unshift @verilator_flags, "--debug-partition" if $param{vltmt}; unshift @verilator_flags, "-CFLAGS -ggdb -LDFLAGS -ggdb" if $opt_gdbsim; @@ -935,19 +932,6 @@ sub compile_vlt_flags { $param{make_main} && $param{verilator_make_gmake}; unshift @verilator_flags, "../" . $self->{main_filename} if $param{make_main} && $param{verilator_make_gmake}; - if (defined $opt_optimize) { - my $letters = ""; - if ($opt_optimize =~ /[a-zA-Z]/) { - $letters = $opt_optimize; - } else { # Randomly turn on/off different optimizations - foreach my $l ('a' .. 'z') { - $letters .= ((rand() > 0.5) ? $l : uc $l); - } - unshift @verilator_flags, "--trace" if rand() > 0.5; - unshift @verilator_flags, "--coverage" if rand() > 0.5; - } - unshift @verilator_flags, "--O" . $letters; - } my @cmdargs = ( "--prefix " . $param{VM_PREFIX}, @@ -2907,11 +2891,6 @@ Displays this message and program version and exits. Run number of parallel tests, or 0 to determine the count based on the number of cores installed. Requires Perl's Parallel::Forker package. -=item --optimize - -Randomly turn on/off different optimizations. 
With specific flags, -use those optimization settings - =item --quiet Suppress all output except for failures and progress messages every 15 diff --git a/test_regress/t/t_altera_lpm_mult_noinl.pl b/test_regress/t/t_altera_lpm_mult_noinl.pl index 2eac39a3a..63f8aa315 100755 --- a/test_regress/t/t_altera_lpm_mult_noinl.pl +++ b/test_regress/t/t_altera_lpm_mult_noinl.pl @@ -15,7 +15,7 @@ top_filename("t/t_altera_lpm.v"); $module =~ s/_noinl//; compile( - verilator_flags2 => ["--top-module ${module}", "-Oi"] + verilator_flags2 => ["--top-module ${module}", "-fno-inline"] ); ok(1); diff --git a/test_regress/t/t_alw_noreorder.pl b/test_regress/t/t_alw_noreorder.pl index 46d021e6b..edc2a6f7b 100755 --- a/test_regress/t/t_alw_noreorder.pl +++ b/test_regress/t/t_alw_noreorder.pl @@ -12,7 +12,7 @@ scenarios(vlt_all => 1); top_filename("t/t_alw_reorder.v"); compile( - verilator_flags2 => ["--stats -Or"], + verilator_flags2 => ["--stats -fno-reorder"], ); file_grep($Self->{stats}, qr/Optimizations, Split always\s+(\d+)/i, 0); diff --git a/test_regress/t/t_assign_inline.pl b/test_regress/t/t_assign_inline.pl index 27414cae0..1683d1777 100755 --- a/test_regress/t/t_assign_inline.pl +++ b/test_regress/t/t_assign_inline.pl @@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di scenarios(simulator => 1); compile( - verilator_flags2 => ["-O0 -OG"], + verilator_flags2 => ["-O0 -fgate"], ); execute( diff --git a/test_regress/t/t_assign_slice_overflow_ox.pl b/test_regress/t/t_assign_slice_overflow_ox.pl index 5251be495..8702b94fe 100755 --- a/test_regress/t/t_assign_slice_overflow_ox.pl +++ b/test_regress/t/t_assign_slice_overflow_ox.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t_assign_slice_overflow.v"); compile( - verilator_flags2 => ["-Ox"], + verilator_flags2 => ["-fno-expand"], ); execute( diff --git a/test_regress/t/t_case_66bits_noexpand.pl b/test_regress/t/t_case_66bits_noexpand.pl index fae2f640f..738da6174 100755 
--- a/test_regress/t/t_case_66bits_noexpand.pl +++ b/test_regress/t/t_case_66bits_noexpand.pl @@ -13,7 +13,7 @@ scenarios(vlt => 1); top_filename("t/t_case_66bits.v"); compile( - verilator_flags2 => ['-Ox'], + verilator_flags2 => ['-fno-expand'], ); execute( diff --git a/test_regress/t/t_case_incrdecr.pl b/test_regress/t/t_case_incrdecr.pl index abbcf936a..729c0cc8a 100755 --- a/test_regress/t/t_case_incrdecr.pl +++ b/test_regress/t/t_case_incrdecr.pl @@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di scenarios(simulator => 1); compile( - verilator_flags2 => ["--trace --Os -x-assign 0"], + verilator_flags2 => ["--trace --fno-split -x-assign 0"], ); execute( diff --git a/test_regress/t/t_case_write1.pl b/test_regress/t/t_case_write1.pl index 4fa36576d..33e2bb517 100755 --- a/test_regress/t/t_case_write1.pl +++ b/test_regress/t/t_case_write1.pl @@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di scenarios(simulator => 1); compile( - verilator_flags2 => ["--stats --O3 -x-assign fast"], + verilator_flags2 => ["--stats -O3 -x-assign fast"], ); execute( diff --git a/test_regress/t/t_case_write1_noexpand.pl b/test_regress/t/t_case_write1_noexpand.pl index cadb667e6..48c57c39a 100755 --- a/test_regress/t/t_case_write1_noexpand.pl +++ b/test_regress/t/t_case_write1_noexpand.pl @@ -13,7 +13,7 @@ scenarios(vlt => 1); top_filename("t/t_case_write1.v"); compile( - verilator_flags2 => ['-Ox'], + verilator_flags2 => ['-fno-expand'], ); execute( diff --git a/test_regress/t/t_case_write2.pl b/test_regress/t/t_case_write2.pl index 4fa36576d..33e2bb517 100755 --- a/test_regress/t/t_case_write2.pl +++ b/test_regress/t/t_case_write2.pl @@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di scenarios(simulator => 1); compile( - verilator_flags2 => ["--stats --O3 -x-assign fast"], + verilator_flags2 => ["--stats -O3 -x-assign fast"], ); execute( diff 
--git a/test_regress/t/t_trace_c_api.pl b/test_regress/t/t_concat_unpack.pl similarity index 52% rename from test_regress/t/t_trace_c_api.pl rename to test_regress/t/t_concat_unpack.pl index 541970008..1aa73f80a 100755 --- a/test_regress/t/t_trace_c_api.pl +++ b/test_regress/t/t_concat_unpack.pl @@ -2,29 +2,20 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; } # DESCRIPTION: Verilator: Verilog Test driver/expect definition # -# Copyright 2003-2013 by Wilson Snyder. This program is free software; you +# Copyright 2022 by Wilson Snyder. This program is free software; you # can redistribute it and/or modify it under the terms of either the GNU # Lesser General Public License Version 3 or the Perl Artistic License # Version 2.0. # SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 -scenarios(vlt => 1); +scenarios(simulator => 1); compile( - make_top_shell => 0, - make_main => 0, - v_flags2 => ["--trace --exe $Self->{t_dir}/t_trace_c_api.cpp", - "-CFLAGS -DVERILATED_VCD_TEST", - "-CFLAGS -DVL_TRACE_VCD_OLD_API"], ); execute( check_finished => 1, ); -# vcddiff bug crashes -#vcd_identical("$Self->{obj_dir}/simx.vcd", -# $Self->{golden_filename}); - ok(1); 1; diff --git a/test_regress/t/t_concat_unpack.v b/test_regress/t/t_concat_unpack.v new file mode 100755 index 000000000..8d3f4bac2 --- /dev/null +++ b/test_regress/t/t_concat_unpack.v @@ -0,0 +1,36 @@ +// DESCRIPTION: Verilator: Verilog Test module +// +// This file ONLY is placed under the Creative Commons Public Domain, for +// any use, without warranty, 2022 by Wilson Snyder. 
+// SPDX-License-Identifier: CC0-1.0 + +module t(/*AUTOARG*/ + // Inputs + clk + ); + input clk; + + wire [31:0] arr [0:7]; + assign arr[0:7] = { + {16'hffff, 16'h0000}, + {16'h0000, 16'h0000}, + {16'h0a0a, 16'h0000}, + {16'ha0a0, 16'h0000}, + {16'hffff, 16'h0000}, + {16'h0000, 16'h0000}, + {16'h0a0a, 16'h0000}, + {16'ha0a0, 16'h0000} + }; + + int cyc = 0; + + always @(posedge clk) begin + cyc <= cyc + 1; + if (cyc == 9) begin + if (arr[0] !== 32'hffff0000) $stop; + if (arr[7] !== 32'ha0a00000) $stop; + $write("*-* All Finished *-*\n"); + $finish; + end + end +endmodule diff --git a/test_regress/t/t_const_no_opt.pl b/test_regress/t/t_const_no_opt.pl index 33be39810..79bc15076 100755 --- a/test_regress/t/t_const_no_opt.pl +++ b/test_regress/t/t_const_no_opt.pl @@ -13,7 +13,7 @@ top_filename("t/t_const_opt.v"); # Run the same design as t_const_opt.pl without bitopt tree optimization to make sure that the result is same. compile( - verilator_flags2 => ["-Wno-UNOPTTHREADS", "--stats", "-Oo", "$Self->{t_dir}/t_const_opt.cpp"], + verilator_flags2 => ["-Wno-UNOPTTHREADS", "--stats", "-fno-const-bit-op-tree", "$Self->{t_dir}/t_const_opt.cpp"], ); execute( diff --git a/test_regress/t/t_const_opt.pl b/test_regress/t/t_const_opt.pl index 26143eb57..83e301744 100755 --- a/test_regress/t/t_const_opt.pl +++ b/test_regress/t/t_const_opt.pl @@ -18,5 +18,8 @@ execute( check_finished => 1, ); +if ($Self->{vlt}) { + file_grep($Self->{stats}, qr/Optimizations, Const bit op reduction\s+(\d+)/i, 11); +} ok(1); 1; diff --git a/test_regress/t/t_const_opt.v b/test_regress/t/t_const_opt.v index be1e49c03..407fef13c 100644 --- a/test_regress/t/t_const_opt.v +++ b/test_regress/t/t_const_opt.v @@ -4,6 +4,11 @@ // any use, without warranty, 2021 Yutetsu TAKATSUKASA. // SPDX-License-Identifier: CC0-1.0 +// This function always returns 0, so safe to take bitwise OR with any value. +// Calling this function stops constant folding as Verilator does not know +// what this function returns. 
+import "DPI-C" context function int fake_dependency(); + module t(/*AUTOARG*/ // Inputs clk @@ -57,7 +62,8 @@ module t(/*AUTOARG*/ $write("[%0t] cyc==%0d crc=%x sum=%x\n", $time, cyc, crc, sum); if (crc !== 64'hc77bb9b3784ea091) $stop; // What checksum will we end up with (above print should match) -`define EXPECTED_SUM 64'hcae926ece668f35d +`define EXPECTED_SUM 64'hdccb9e7b8b638233 + if (sum !== `EXPECTED_SUM) $stop; $write("*-* All Finished *-*\n"); $finish; @@ -79,10 +85,11 @@ module Test(/*AUTOARG*/ logic d0, d1, d2, d3, d4, d5, d6, d7; logic bug3182_out; logic bug3197_out; + logic bug3445_out; output logic o; - logic [6:0] tmp; + logic [7:0] tmp; assign o = ^tmp; always_ff @(posedge clk) begin @@ -105,10 +112,12 @@ module Test(/*AUTOARG*/ tmp[4] <= i[0] & (i[1] & (i[2] & (i[3] | d[4]))); // ConstBitOpTreeVisitor::m_frozenNodes tmp[5] <= bug3182_out; tmp[6] <= bug3197_out; + tmp[7] <= bug3445_out; end bug3182 i_bug3182(.in(d[4:0]), .out(bug3182_out)); bug3197 i_bug3197(.clk(clk), .in(d), .out(bug3197_out)); + bug3445 i_bug3445(.clk(clk), .in(d), .out(bug3445_out)); endmodule @@ -116,11 +125,6 @@ module bug3182(in, out); input wire [4:0] in; output wire out; - // This function always returns 0, so safe to take bitwise OR with any value. - // Calling this function stops constant folding as Verialtor does not know - // what this function returns. - import "DPI-C" context function int fake_dependency(); - logic [4:0] bit_source; /* verilator lint_off WIDTH */ @@ -140,3 +144,62 @@ module bug3197(input wire clk, input wire [31:0] in, output out); wire tmp0 = (|d[38:0]); assign out = (d[39] | tmp0); endmodule + + +// Bug #3445 +// An unoptimized node is kept as frozen node, but its LSB and polarity were not saved. +// AST of RHS of result0 looks as below: +// AND(SHIFTR(AND(WORDSEL(ARRAYSEL(VARREF)), WORDSEL(ARRAYSEL(VARREF)))), 32'd11) +// ~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~ +// Two of WORDSELs are frozen nodes. They are under SHIFTR of 11 bits. 
+// +// Fixing #3445 needs to +// 1. Take AstShiftR and AstNot into op count when deciding optimizable or not +// (result0 and result2 in the test) +// 2. Insert AstShiftR if LSB of the frozen node is not 0 (result1 in the test) +// 3. Insert AstNot if polarity of the frozen node is false (result3 in the +// test) +module bug3445(input wire clk, input wire [31:0] in, output wire out); + logic [127:0] d; + always_ff @(posedge clk) + d <= {d[95:0], in}; + + typedef struct packed { + logic a; + logic [ 2:0] b; + logic [ 2:0] c; + logic [ 1:0] d; + logic [ 7:0] e; + logic [31:0] f; + logic [ 3:0] g; + logic [31:0] h; + logic i; + logic [41:0] j; + } packed_struct; + packed_struct st[4]; + + // This is always 1'b0, but Verilator cannot notice it. + // This signal helps to reveal wrong optimization of result2 and result3. + logic zero; + always_ff @(posedge clk) begin + st[0] <= d; + st[1] <= st[0]; + st[2] <= st[1]; + st[3] <= st[2]; + zero <= fake_dependency() > 0; + end + + logic result0, result1, result2, result3; + always_ff @(posedge clk) begin + // Cannot optimize further. + result0 <= (st[0].g[0] & st[0].h[0]) & (in[0] == 1'b0); + // There are redundant !in[0] terms. They should be simplified. + result1 <= (!in[0] & (st[1].g[0] & st[1].h[0])) & ((in[0] == 1'b0) & !in[0]); + // Cannot optimize further. + result2 <= !(st[2].g[0] & st[2].h[0]) & (zero == 1'b0); + // There are redundant zero terms. They should be simplified. 
+ result3 <= (!zero & !(st[3].g[0] & st[3].h[0])) & ((zero == 1'b0) & !zero); + end + + assign out = result0 ^ result1 ^ (result2 | result3); +endmodule diff --git a/test_regress/t/t_emit_constw.pl b/test_regress/t/t_emit_constw.pl index 9b1487fcd..8f7895804 100755 --- a/test_regress/t/t_emit_constw.pl +++ b/test_regress/t/t_emit_constw.pl @@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di scenarios(simulator => 1); compile( - verilator_flags2 => ['--Ox'], + verilator_flags2 => ['--fno-expand'], ); execute( diff --git a/test_regress/t/t_extract_static_const_no_merge.pl b/test_regress/t/t_extract_static_const_no_merge.pl index ff9a694d4..f656fe455 100755 --- a/test_regress/t/t_extract_static_const_no_merge.pl +++ b/test_regress/t/t_extract_static_const_no_merge.pl @@ -14,7 +14,7 @@ top_filename("t/t_extract_static_const.v"); golden_filename("t/t_extract_static_const.out"); compile( - verilator_flags2 => ["--stats", "--no-merge-const-pool"], + verilator_flags2 => ["--stats", "--fno-merge-const-pool"], ); execute( diff --git a/test_regress/t/t_func_twocall_noexpand.pl b/test_regress/t/t_func_twocall_noexpand.pl index 001824bc6..452d4b37a 100755 --- a/test_regress/t/t_func_twocall_noexpand.pl +++ b/test_regress/t/t_func_twocall_noexpand.pl @@ -13,7 +13,7 @@ scenarios(vlt => 1); top_filename("t/t_func_twocall.v"); compile( - verilator_flags2 => ['-Ox'], + verilator_flags2 => ['-fno-expand'], ); execute( diff --git a/test_regress/t/t_gen_genblk_noinl.pl b/test_regress/t/t_gen_genblk_noinl.pl index 7574a1cfb..ef537cd4d 100755 --- a/test_regress/t/t_gen_genblk_noinl.pl +++ b/test_regress/t/t_gen_genblk_noinl.pl @@ -16,7 +16,7 @@ scenarios(simulator => 1); $Self->{sim_time} = 11000; compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_incr_void.pl b/test_regress/t/t_incr_void.pl index 5b95e5b74..e7d3e18e3 100755 --- a/test_regress/t/t_incr_void.pl +++ 
b/test_regress/t/t_incr_void.pl @@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di scenarios(simulator => 1); compile( - verilator_flags2 => ["--Os -x-assign 0"], + verilator_flags2 => ["--fno-split -x-assign 0"], ); execute( diff --git a/test_regress/t/t_inst_slice_noinl.pl b/test_regress/t/t_inst_slice_noinl.pl index 11f75c752..aa56e6155 100755 --- a/test_regress/t/t_inst_slice_noinl.pl +++ b/test_regress/t/t_inst_slice_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_inst_slice.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface1_modport_noinl.pl b/test_regress/t/t_interface1_modport_noinl.pl index b077bef4e..4f4b314ae 100755 --- a/test_regress/t/t_interface1_modport_noinl.pl +++ b/test_regress/t/t_interface1_modport_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface1_modport.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface1_noinl.pl b/test_regress/t/t_interface1_noinl.pl index 3c9d8d316..867b1e993 100755 --- a/test_regress/t/t_interface1_noinl.pl +++ b/test_regress/t/t_interface1_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface1.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface2_noinl.pl b/test_regress/t/t_interface2_noinl.pl index 57b72e7a7..cad1b6e3d 100755 --- a/test_regress/t/t_interface2_noinl.pl +++ b/test_regress/t/t_interface2_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface2.v"); compile( - verilator_flags2 => ["--top-module t -Oi"], + verilator_flags2 => ["--top-module t -fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_array2_noinl.pl b/test_regress/t/t_interface_array2_noinl.pl index ad389d0fb..7bf1518f5 100755 --- a/test_regress/t/t_interface_array2_noinl.pl +++ 
b/test_regress/t/t_interface_array2_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_array2.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_array_noinl.pl b/test_regress/t/t_interface_array_noinl.pl index 02bf8fd89..df71f77e9 100755 --- a/test_regress/t/t_interface_array_noinl.pl +++ b/test_regress/t/t_interface_array_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_array.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_down_noinl.pl b/test_regress/t/t_interface_down_noinl.pl index fb03fc988..34ce5cb69 100755 --- a/test_regress/t/t_interface_down_noinl.pl +++ b/test_regress/t/t_interface_down_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_down.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_gen10_noinl.pl b/test_regress/t/t_interface_gen10_noinl.pl index e5c3f22c5..f691c6d0a 100755 --- a/test_regress/t/t_interface_gen10_noinl.pl +++ b/test_regress/t/t_interface_gen10_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_gen10.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_gen11_noinl.pl b/test_regress/t/t_interface_gen11_noinl.pl index 82a6a9a27..d1e7dd3c0 100755 --- a/test_regress/t/t_interface_gen11_noinl.pl +++ b/test_regress/t/t_interface_gen11_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_gen11.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_gen12_noinl.pl b/test_regress/t/t_interface_gen12_noinl.pl index c3f59ba19..8ebecd448 100755 --- a/test_regress/t/t_interface_gen12_noinl.pl +++ b/test_regress/t/t_interface_gen12_noinl.pl @@ -13,7 +13,7 @@ 
scenarios(simulator => 1); top_filename("t/t_interface_gen12.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_gen2_noinl.pl b/test_regress/t/t_interface_gen2_noinl.pl index fc7c4bfb1..eb772bab6 100755 --- a/test_regress/t/t_interface_gen2_noinl.pl +++ b/test_regress/t/t_interface_gen2_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_gen2.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_gen3_noinl.pl b/test_regress/t/t_interface_gen3_noinl.pl index e49dfc39a..b63c72eb9 100755 --- a/test_regress/t/t_interface_gen3_noinl.pl +++ b/test_regress/t/t_interface_gen3_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_gen3.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_gen4_noinl.pl b/test_regress/t/t_interface_gen4_noinl.pl index 4a0b00930..e724c2859 100755 --- a/test_regress/t/t_interface_gen4_noinl.pl +++ b/test_regress/t/t_interface_gen4_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_gen4.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_gen5_noinl.pl b/test_regress/t/t_interface_gen5_noinl.pl index 0873ce9c5..5b4852691 100755 --- a/test_regress/t/t_interface_gen5_noinl.pl +++ b/test_regress/t/t_interface_gen5_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_gen5.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_gen6_noinl.pl b/test_regress/t/t_interface_gen6_noinl.pl index 4c42c6797..e43d9460a 100755 --- a/test_regress/t/t_interface_gen6_noinl.pl +++ b/test_regress/t/t_interface_gen6_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_gen6.v"); compile( - 
v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_gen7_noinl.pl b/test_regress/t/t_interface_gen7_noinl.pl index 27cb3ea61..458c5f0f6 100755 --- a/test_regress/t/t_interface_gen7_noinl.pl +++ b/test_regress/t/t_interface_gen7_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_gen7.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_gen8_noinl.pl b/test_regress/t/t_interface_gen8_noinl.pl index ba3b2b132..644d9a10e 100755 --- a/test_regress/t/t_interface_gen8_noinl.pl +++ b/test_regress/t/t_interface_gen8_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_gen8.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_gen9_noinl.pl b/test_regress/t/t_interface_gen9_noinl.pl index 48f4eb8be..6ac0d6296 100755 --- a/test_regress/t/t_interface_gen9_noinl.pl +++ b/test_regress/t/t_interface_gen9_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_gen9.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_gen_noinl.pl b/test_regress/t/t_interface_gen_noinl.pl index 5813d42eb..17273106f 100755 --- a/test_regress/t/t_interface_gen_noinl.pl +++ b/test_regress/t/t_interface_gen_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_gen.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_inl.pl b/test_regress/t/t_interface_inl.pl index efb67ed7f..08dfa385c 100755 --- a/test_regress/t/t_interface_inl.pl +++ b/test_regress/t/t_interface_inl.pl @@ -14,7 +14,7 @@ top_filename("t/t_interface.v"); compile( # Avoid inlining so we find bugs in the non-inliner connection code - verilator_flags2 => ["-Oi"], + verilator_flags2 => ["-fno-inline"], ); execute( 
diff --git a/test_regress/t/t_interface_modport_import_noinl.pl b/test_regress/t/t_interface_modport_import_noinl.pl index 3821fef11..a9e97bee1 100755 --- a/test_regress/t/t_interface_modport_import_noinl.pl +++ b/test_regress/t/t_interface_modport_import_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_modport_import.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_modport_inl.pl b/test_regress/t/t_interface_modport_inl.pl index 9afcd9cdd..eb2ca2181 100755 --- a/test_regress/t/t_interface_modport_inl.pl +++ b/test_regress/t/t_interface_modport_inl.pl @@ -14,7 +14,7 @@ top_filename("t/t_interface_modport.v"); compile( # Avoid inlining so we find bugs in the non-inliner connection code - verilator_flags2 => ["-Oi"], + verilator_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_modport_noinl.pl b/test_regress/t/t_interface_modport_noinl.pl index 4c051df1a..7f1015d23 100755 --- a/test_regress/t/t_interface_modport_noinl.pl +++ b/test_regress/t/t_interface_modport_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_modport.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_mp_func_noinl.pl b/test_regress/t/t_interface_mp_func_noinl.pl index 432a7308a..89f4835b5 100755 --- a/test_regress/t/t_interface_mp_func_noinl.pl +++ b/test_regress/t/t_interface_mp_func_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_mp_func.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_nest_noinl.pl b/test_regress/t/t_interface_nest_noinl.pl index 9d88a39a0..e042d33c1 100755 --- a/test_regress/t/t_interface_nest_noinl.pl +++ b/test_regress/t/t_interface_nest_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_nest.v"); compile( - 
v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_noinl.pl b/test_regress/t/t_interface_noinl.pl index 52cb09c98..7be6235ad 100755 --- a/test_regress/t/t_interface_noinl.pl +++ b/test_regress/t/t_interface_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_interface_twod_noinl.pl b/test_regress/t/t_interface_twod_noinl.pl index 18f0adf62..e77089cb0 100755 --- a/test_regress/t/t_interface_twod_noinl.pl +++ b/test_regress/t/t_interface_twod_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_interface_twod.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_lint_setout_bad_noinl.pl b/test_regress/t/t_lint_setout_bad_noinl.pl index 4b5131821..cbbf96bb4 100755 --- a/test_regress/t/t_lint_setout_bad_noinl.pl +++ b/test_regress/t/t_lint_setout_bad_noinl.pl @@ -13,7 +13,7 @@ scenarios(linter => 1); top_filename("t/t_lint_setout_bad.v"); lint( - verilator_flags2 => ["--lint-only -Oi"], + verilator_flags2 => ["--lint-only -fno-inline"], fails => 1, expect_filename => $Self->{golden_filename}, ); diff --git a/test_regress/t/t_math_cond_huge_noexpand.pl b/test_regress/t/t_math_cond_huge_noexpand.pl index 0ae4e3ce4..15399cb9f 100755 --- a/test_regress/t/t_math_cond_huge_noexpand.pl +++ b/test_regress/t/t_math_cond_huge_noexpand.pl @@ -13,7 +13,7 @@ scenarios(vlt => 1); top_filename("t/t_math_cond_huge.v"); compile( - verilator_flags2 => ['-Ox'], + verilator_flags2 => ['-fno-expand'], ); execute( diff --git a/test_regress/t/t_math_div_noexpand.pl b/test_regress/t/t_math_div_noexpand.pl index 4dbcba15c..fa7ecd2ec 100755 --- a/test_regress/t/t_math_div_noexpand.pl +++ b/test_regress/t/t_math_div_noexpand.pl @@ -13,7 +13,7 @@ scenarios(vlt => 1); top_filename("t/t_math_div.v"); compile( - verilator_flags2 => 
['-Ox'], + verilator_flags2 => ['-fno-expand'], ); execute( diff --git a/test_regress/t/t_math_eq_noexpand.pl b/test_regress/t/t_math_eq_noexpand.pl index f8b2375c0..2c3907b70 100755 --- a/test_regress/t/t_math_eq_noexpand.pl +++ b/test_regress/t/t_math_eq_noexpand.pl @@ -13,7 +13,7 @@ scenarios(vlt => 1); top_filename("t/t_math_eq.v"); compile( - verilator_flags2 => ['-Ox'], + verilator_flags2 => ['-fno-expand'], ); execute( diff --git a/test_regress/t/t_math_red_noexpand.pl b/test_regress/t/t_math_red_noexpand.pl index 89e54c0c9..655ce0246 100755 --- a/test_regress/t/t_math_red_noexpand.pl +++ b/test_regress/t/t_math_red_noexpand.pl @@ -13,7 +13,7 @@ scenarios(vlt => 1); top_filename("t/t_math_red.v"); compile( - verilator_flags2 => ['-Ox'], + verilator_flags2 => ['-fno-expand'], ); execute( diff --git a/test_regress/t/t_math_shift_noexpand.pl b/test_regress/t/t_math_shift_noexpand.pl index acf420f1a..e27343a72 100755 --- a/test_regress/t/t_math_shift_noexpand.pl +++ b/test_regress/t/t_math_shift_noexpand.pl @@ -13,7 +13,7 @@ scenarios(vlt => 1); top_filename("t/t_math_shift.v"); compile( - verilator_flags2 => ['-Ox'], + verilator_flags2 => ['-fno-expand'], ); execute( diff --git a/test_regress/t/t_math_signed_noexpand.pl b/test_regress/t/t_math_signed_noexpand.pl index 336d35594..b086af557 100755 --- a/test_regress/t/t_math_signed_noexpand.pl +++ b/test_regress/t/t_math_signed_noexpand.pl @@ -13,7 +13,7 @@ scenarios(vlt => 1); top_filename("t/t_math_signed.v"); compile( - verilator_flags2 => ['-Ox'], + verilator_flags2 => ['-fno-expand'], ); execute( diff --git a/test_regress/t/t_math_vliw_noexpand.pl b/test_regress/t/t_math_vliw_noexpand.pl index fce202e04..5ca1e425f 100755 --- a/test_regress/t/t_math_vliw_noexpand.pl +++ b/test_regress/t/t_math_vliw_noexpand.pl @@ -13,7 +13,7 @@ scenarios(vlt => 1); top_filename("t/t_math_vliw.v"); compile( - verilator_flags2 => ['-Ox'], + verilator_flags2 => ['-fno-expand'], ); execute( diff --git 
a/test_regress/t/t_mem_multi_io.pl b/test_regress/t/t_mem_multi_io.pl index 1691d75f1..4e371f1d7 100755 --- a/test_regress/t/t_mem_multi_io.pl +++ b/test_regress/t/t_mem_multi_io.pl @@ -12,7 +12,7 @@ scenarios(simulator => 1); compile( # Disable inlining, this test is trivial without it - verilator_flags2 => ["-Oi --trace"], + verilator_flags2 => ["-fno-inline --trace"], verilator_flags3 => [], ); diff --git a/test_regress/t/t_mem_multi_io2_cc.pl b/test_regress/t/t_mem_multi_io2_cc.pl index 3edda698b..bfd551aed 100755 --- a/test_regress/t/t_mem_multi_io2_cc.pl +++ b/test_regress/t/t_mem_multi_io2_cc.pl @@ -15,7 +15,7 @@ top_filename("t/t_mem_multi_io2.v"); compile( make_top_shell => 0, make_main => 0, - verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io2.cpp -Oi"], + verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io2.cpp -fno-inline"], verilator_flags3 => [], ); diff --git a/test_regress/t/t_mem_multi_io2_sc.pl b/test_regress/t/t_mem_multi_io2_sc.pl index 11ae8cbfc..2fb4bf70c 100755 --- a/test_regress/t/t_mem_multi_io2_sc.pl +++ b/test_regress/t/t_mem_multi_io2_sc.pl @@ -15,7 +15,7 @@ top_filename("t/t_mem_multi_io2.v"); compile( make_top_shell => 0, make_main => 0, - verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io2.cpp --sc -Oi"], + verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io2.cpp --sc -fno-inline"], ); execute( diff --git a/test_regress/t/t_mem_multi_io3_cc.pl b/test_regress/t/t_mem_multi_io3_cc.pl index 4ad019dbf..b6090a775 100755 --- a/test_regress/t/t_mem_multi_io3_cc.pl +++ b/test_regress/t/t_mem_multi_io3_cc.pl @@ -15,7 +15,7 @@ top_filename("t/t_mem_multi_io3.v"); compile( make_top_shell => 0, make_main => 0, - verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io3.cpp -Oi"], + verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io3.cpp -fno-inline"], verilator_flags3 => [], ); diff --git a/test_regress/t/t_mem_multi_io3_sc.pl b/test_regress/t/t_mem_multi_io3_sc.pl index 5825c7845..f37d9dedd 100755 --- 
a/test_regress/t/t_mem_multi_io3_sc.pl +++ b/test_regress/t/t_mem_multi_io3_sc.pl @@ -15,7 +15,7 @@ top_filename("t/t_mem_multi_io3.v"); compile( make_top_shell => 0, make_main => 0, - verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io3.cpp --sc -Oi"], + verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io3.cpp --sc -fno-inline"], verilator_flags3 => [], ); diff --git a/test_regress/t/t_mem_multidim_Ox.pl b/test_regress/t/t_mem_multidim_Ox.pl index bb4dbc122..ccde0bbbd 100755 --- a/test_regress/t/t_mem_multidim_Ox.pl +++ b/test_regress/t/t_mem_multidim_Ox.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_mem_multidim.v"); compile( - verilator_flags2 => ['--Ox'], + verilator_flags2 => ['--fno-expand'], ); execute( diff --git a/test_regress/t/t_mem_packed_noexpand.pl b/test_regress/t/t_mem_packed_noexpand.pl index d5fc2b5da..df4c82d6d 100755 --- a/test_regress/t/t_mem_packed_noexpand.pl +++ b/test_regress/t/t_mem_packed_noexpand.pl @@ -13,7 +13,7 @@ scenarios(vlt => 1); top_filename("t/t_mem_packed.v"); compile( - verilator_flags2 => ['-Ox'], + verilator_flags2 => ['-fno-expand'], ); execute( diff --git a/test_regress/t/t_merge_cond.pl b/test_regress/t/t_merge_cond.pl index 51f97242d..971a808af 100755 --- a/test_regress/t/t_merge_cond.pl +++ b/test_regress/t/t_merge_cond.pl @@ -21,11 +21,11 @@ execute( if ($Self->{vlt}) { # Note, with vltmt this might be split differently, so only checking vlt file_grep($Self->{stats}, qr/Optimizations, MergeCond merges\s+(\d+)/i, - 10); + 9); file_grep($Self->{stats}, qr/Optimizations, MergeCond merged items\s+(\d+)/i, 580); file_grep($Self->{stats}, qr/Optimizations, MergeCond longest merge\s+(\d+)/i, - 64); + 128); } ok(1); diff --git a/test_regress/t/t_merge_cond_blowup.pl b/test_regress/t/t_merge_cond_blowup.pl new file mode 100755 index 000000000..aa9e8e1fe --- /dev/null +++ b/test_regress/t/t_merge_cond_blowup.pl @@ -0,0 +1,34 @@ +#!/usr/bin/env perl +if (!$::Driver) { use FindBin; 
exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; } +# DESCRIPTION: Verilator: Verilog Test driver/expect definition +# +# Copyright 2022 by Geza Lore. This program is free software; you +# can redistribute it and/or modify it under the terms of either the GNU +# Lesser General Public License Version 3 or the Perl Artistic License +# Version 2.0. +# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 + +scenarios(vlt => 1); + +# TODO: This takes excessively long on vltmt, this should be fixed + +compile( + verilator_flags2 => ["--unroll-count 1000000000", "--output-split 0", "--stats"], + ); + +execute( + check_finished => 1, + ); + +if ($Self->{vlt}) { + # Note, with vltmt this might be split differently, so only checking vlt + file_grep($Self->{stats}, qr/Optimizations, MergeCond merges\s+(\d+)/i, + 500); # V3MergeCond.cpp MAX_DISTANCE + file_grep($Self->{stats}, qr/Optimizations, MergeCond merged items\s+(\d+)/i, + 1000); # V3MergeCond.cpp MAX_DISTANCE *2 + file_grep($Self->{stats}, qr/Optimizations, MergeCond longest merge\s+(\d+)/i, + 2); +} + +ok(1); +1; diff --git a/test_regress/t/t_merge_cond_blowup.v b/test_regress/t/t_merge_cond_blowup.v new file mode 100644 index 000000000..aa97f8f26 --- /dev/null +++ b/test_regress/t/t_merge_cond_blowup.v @@ -0,0 +1,55 @@ +// DESCRIPTION: Verilator: Verilog Test module +// +// This file ONLY is placed under the Creative Commons Public Domain, for +// any use, without warranty, 2022 by Geza Lore. +// SPDX-License-Identifier: CC0-1.0 + +module t (/*AUTOARG*/ + // Inputs + clk + ); + input clk; + + localparam int N = 4096; + + integer cyc = 0; + reg [63:0] crc= 64'h5aef0c8d_d70a4497; + + always @ (posedge clk) begin + cyc <= cyc + 1; + crc <= {crc[62:0], crc[63] ^ crc[2] ^ crc[0]}; + + if (cyc==99) begin + $write("*-* All Finished *-*\n"); + $finish; + end + end + + reg a [N-1:0]; + reg b [N-1:0]; + + // This yields pathological complexity for the current conditional merging + // algorithm. 
Note in practice, other parts of the compiler blow up on this + // code far earlier than the conditional merging, but here we go anyway. + generate + genvar i; + for (i = 0 ; i < N ; i = i + 1) begin + always @(posedge clk) a[i] <= (crc + 64'(i)) == 0 ? crc[(i+16)%64] : crc[(i+32)%64]; + end + for (i = 0 ; i < N ; i = i + 1) begin + always @(posedge clk) b[i] <= (crc + 64'(i)) == 0 ? crc[(i+16)%64] : crc[(i+32)%64]; + end + endgenerate + + always @(posedge clk) begin + if (cyc >= 2) begin + for (int i = 0 ; i < N ; i = i + 1) begin + if (a[i] !== b[i]) begin + $write("%%Error: %s:%0d: cyc=%0d i=%0d a[i]='h%x b[i]='h%x\n", `__FILE__,`__LINE__, cyc, i, a[i], b[i]); + $stop; + end + end + end + end + +endmodule diff --git a/test_regress/t/t_mod_interface_array0_noinl.pl b/test_regress/t/t_mod_interface_array0_noinl.pl index 3c74fd016..56032e0d9 100755 --- a/test_regress/t/t_mod_interface_array0_noinl.pl +++ b/test_regress/t/t_mod_interface_array0_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_mod_interface_array0.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_mod_interface_array1_noinl.pl b/test_regress/t/t_mod_interface_array1_noinl.pl index 34871282a..651bb1c65 100755 --- a/test_regress/t/t_mod_interface_array1_noinl.pl +++ b/test_regress/t/t_mod_interface_array1_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_mod_interface_array1.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_mod_interface_array2_noinl.pl b/test_regress/t/t_mod_interface_array2_noinl.pl index c19612e57..2afa9e020 100755 --- a/test_regress/t/t_mod_interface_array2_noinl.pl +++ b/test_regress/t/t_mod_interface_array2_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_mod_interface_array2.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git 
a/test_regress/t/t_mod_interface_array4_noinl.pl b/test_regress/t/t_mod_interface_array4_noinl.pl index 6797c1016..62ad2ca24 100755 --- a/test_regress/t/t_mod_interface_array4_noinl.pl +++ b/test_regress/t/t_mod_interface_array4_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_mod_interface_array4.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_mod_interface_array6_noinl.pl b/test_regress/t/t_mod_interface_array6_noinl.pl index 5244ac42c..f07ea1917 100755 --- a/test_regress/t/t_mod_interface_array6_noinl.pl +++ b/test_regress/t/t_mod_interface_array6_noinl.pl @@ -13,7 +13,7 @@ scenarios(simulator => 1); top_filename("t/t_mod_interface_array6.v"); compile( - v_flags2 => ["-Oi"], + v_flags2 => ["-fno-inline"], ); execute( diff --git a/test_regress/t/t_optm_if_cond.pl b/test_regress/t/t_optm_if_cond.pl index 91aa0aae5..3215fbf08 100755 --- a/test_regress/t/t_optm_if_cond.pl +++ b/test_regress/t/t_optm_if_cond.pl @@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di scenarios(vlt => 1); compile( - verilator_flags2 => ['--stats', "-Ow"], + verilator_flags2 => ['--stats', "-fno-merge-cond"], ); if ($Self->{vlt_all}) { diff --git a/test_regress/t/t_trace_c_api.cpp b/test_regress/t/t_trace_c_api.cpp deleted file mode 100644 index d2d3f0921..000000000 --- a/test_regress/t/t_trace_c_api.cpp +++ /dev/null @@ -1,24 +0,0 @@ -// -*- mode: C++; c-file-style: "cc-mode" -*- -// -// DESCRIPTION: Verilator: Verilog Test module -// -// This file ONLY is placed under the Creative Commons Public Domain, for -// any use, without warranty, 2008 by Wilson Snyder. 
-// SPDX-License-Identifier: CC0-1.0 - -#include -#include - -#include VM_PREFIX_INCLUDE - -double sc_time_stamp() { return 0; } - -extern void vcdTestMain(const char* filenamep); - -int main(int argc, char** argv, char** env) { - const char* filenamep = VL_STRINGIFY(TEST_OBJ_DIR) "/simx.vcd"; - printf("Writing %s\n", filenamep); - vcdTestMain(filenamep); - printf("*-* All Finished *-*\n"); - return 0; -} diff --git a/test_regress/t/t_trace_c_api.v b/test_regress/t/t_trace_c_api.v deleted file mode 100644 index 7b440cb91..000000000 --- a/test_regress/t/t_trace_c_api.v +++ /dev/null @@ -1,8 +0,0 @@ -// DESCRIPTION: Verilator: Verilog Test module -// -// This file ONLY is placed under the Creative Commons Public Domain, for -// any use, without warranty, 2013 by Wilson Snyder. -// SPDX-License-Identifier: CC0-1.0 - -module t; -endmodule diff --git a/test_regress/t/t_trace_complex_old_api.pl b/test_regress/t/t_trace_complex_old_api.pl deleted file mode 100755 index 8136d3f79..000000000 --- a/test_regress/t/t_trace_complex_old_api.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env perl -if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; } -# DESCRIPTION: Verilator: Verilog Test driver/expect definition -# -# Copyright 2003-2009 by Wilson Snyder. This program is free software; you -# can redistribute it and/or modify it under the terms of either the GNU -# Lesser General Public License Version 3 or the Perl Artistic License -# Version 2.0. 
-# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 - -# Same test as t_trace_complex, but exercising the old VCD tracing API - -scenarios(vlt => 1); - -top_filename("t/t_trace_complex.v"); -golden_filename("t/t_trace_complex.out"); - -compile( - verilator_flags2 => ['--cc --trace -CFLAGS -DVL_TRACE_VCD_OLD_API'], - ); - -execute( - check_finished => 1, - ); - -file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_strp /); -file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_strp_strp /); -file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arrp /); -file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_arrp /); -file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_strp /); -file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arru\[/); -file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arru\[/); -file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arrp\[/); -file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arru_strp\[/); - -vcd_identical("$Self->{obj_dir}/simx.vcd", $Self->{golden_filename}); - -ok(1); -1; diff --git a/test_regress/t/t_unpacked_concat_bad.out b/test_regress/t/t_unpacked_concat_bad.out index 4c89adfe6..1482e7507 100644 --- a/test_regress/t/t_unpacked_concat_bad.out +++ b/test_regress/t/t_unpacked_concat_bad.out @@ -1,23 +1,6 @@ -%Error-UNSUPPORTED: t/t_unpacked_concat_bad.v:17:46: Unsupported: Replication to form 'bit[31:0]$[1:0]' data type +%Error-UNSUPPORTED: t/t_unpacked_concat_bad.v:17:46: Unsupported: Non-1 replication to form 'bit[31:0]$[1:0]' data type : ... In instance t 17 | localparam bit_int_t count_bits [1:0] = {2{$bits(count_t)}}; | ^ ... For error description see https://verilator.org/warn/UNSUPPORTED?v=latest -%Warning-WIDTHCONCAT: t/t_unpacked_concat_bad.v:17:47: Unsized numbers/parameters not allowed in replications. - : ... In instance t - 17 | localparam bit_int_t count_bits [1:0] = {2{$bits(count_t)}}; - | ^~~~~ - ... Use "/* verilator lint_off WIDTHCONCAT */" and lint_on around source to disable this message. 
-%Error-UNSUPPORTED: t/t_unpacked_concat_bad.v:18:45: Unsupported: Replication to form 'bit[31:0]$[1:0]' data type - : ... In instance t - 18 | localparam bit_int_t count_bitsc [1:0] = {$bits(count_t), $bits(count_t)}; - | ^ -%Warning-WIDTHCONCAT: t/t_unpacked_concat_bad.v:18:46: Unsized numbers/parameters not allowed in concatenations. - : ... In instance t - 18 | localparam bit_int_t count_bitsc [1:0] = {$bits(count_t), $bits(count_t)}; - | ^~~~~ -%Warning-WIDTHCONCAT: t/t_unpacked_concat_bad.v:18:60: Unsized numbers/parameters not allowed in replications. - : ... In instance t - 18 | localparam bit_int_t count_bitsc [1:0] = {$bits(count_t), $bits(count_t)}; - | ^ %Error: Exiting due to diff --git a/test_regress/t/t_var_assign_landr_noexpand.pl b/test_regress/t/t_var_assign_landr_noexpand.pl index cd058334d..e616f77c3 100755 --- a/test_regress/t/t_var_assign_landr_noexpand.pl +++ b/test_regress/t/t_var_assign_landr_noexpand.pl @@ -13,7 +13,7 @@ scenarios(vlt => 1); top_filename("t/t_var_assign_landr.v"); compile( - verilator_flags2 => ['-Ox'], + verilator_flags2 => ['-fno-expand'], ); execute(