diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 47b5f70b2..87310899f 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -29,7 +29,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-20.04, ubuntu-18.04]
+        os: [ubuntu-22.04, ubuntu-20.04, ubuntu-18.04]
         compiler:
           - { cc: clang, cxx: clang++ }
           - { cc: gcc,   cxx: g++     }
@@ -37,9 +37,11 @@ jobs:
         exclude:
           # Build pull requests only with ubuntu-20.04 and without m32
           - os:  ${{ github.event_name == 'pull_request' && 'ubuntu-18.04' || 'do-not-exclude' }}
+          - os:  ${{ github.event_name == 'pull_request' && 'ubuntu-22.04' || 'do-not-exclude' }}
           - m32: ${{ github.event_name == 'pull_request' && 1              || 'do-not-exclude' }}
           # Build -m32 only on ubuntu-20.04
           - {os: ubuntu-18.04, m32: 1}
+          - {os: ubuntu-22.04, m32: 1}
         include:
           # Build GCC 10 on ubuntu-20.04
           - os: ubuntu-20.04
@@ -95,7 +97,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-20.04, ubuntu-18.04]
+        os: [ubuntu-22.04, ubuntu-20.04, ubuntu-18.04]
         compiler:
           - { cc: clang, cxx: clang++ }
           - { cc: gcc,   cxx: g++     }
@@ -104,9 +106,11 @@ jobs:
         exclude:
           # Build pull requests only with ubuntu-20.04 and without m32
           - os:  ${{ github.event_name == 'pull_request' && 'ubuntu-18.04' || 'do-not-exclude' }}
+          - os:  ${{ github.event_name == 'pull_request' && 'ubuntu-22.04' || 'do-not-exclude' }}
           - m32: ${{ github.event_name == 'pull_request' && 1              || 'do-not-exclude' }}
           # Build -m32 only on ubuntu-20.04
           - {os: ubuntu-18.04, m32: 1}
+          - {os: ubuntu-22.04, m32: 1}
         include:
           # Test with GCC 10 on ubuntu-20.04 without m32
           - {os: ubuntu-20.04, compiler: { cc: gcc-10, cxx: g++-10 }, m32: 0, suite: dist-vlt-0}
@@ -122,7 +126,7 @@ jobs:
       CI_M32: ${{ matrix.m32 }}
       CC: ${{ matrix.compiler.cc }}
       CXX: ${{ matrix.compiler.cxx }}
-      CACHE_BASE_KEY: test-${{ matrix.os }}-${{ matrix.compiler.cc }}-m32=${{ matrix.m32 }}-${ matrix.suite }}
+      CACHE_BASE_KEY: test-${{ matrix.os }}-${{ matrix.compiler.cc }}-m32=${{ matrix.m32 }}-${{ matrix.suite }}
       CCACHE_MAXSIZE: 64M # Per build matrix entry (2160M in total)
       VERILATOR_ARCHIVE: verilator-${{ github.sha }}-${{ matrix.os }}-${{ matrix.compiler.cc }}${{ matrix.m32 && '-m32' || '' }}.tar.gz
     steps:
diff --git a/Changes b/Changes
index e2a165c88..69928d82d 100644
--- a/Changes
+++ b/Changes
@@ -22,12 +22,20 @@ Verilator 5.001 devel
 Verilator 4.223 devel
 ==========================
 
+**Major:**
+
+* VCD tracing is now parallelized with --threads (#3449). [Geza Lore, Shunyao CAD]
+
 **Minor:**
 
+* Add -f<optimization> options to replace -O<letter> options (#3436).
+* Changed --no-merge-const-pool to -fno-merge-const-pool (#3436).
 * Support compile time trace signal selection with tracing_on/off (#3323). [Shunyao CAD]
-* Add assert when VerilatedContext is mis-deleted (#3121). [Rupert Swarbrick]
-* Define VM_TRACE_VCD when tracing in VCD format. [Geza Lore, Shunyao CAD]
 * Support non-ANSI interface port declarations (#3439). [Geza Lore, Shunyao CAD]
+* Support concat assignment to packed array (#3446).
+* Improve conditional merging optimization (#3125). [Geza Lore, Shunyao CAD]
+* Define VM_TRACE_VCD when tracing in VCD format. [Geza Lore, Shunyao CAD]
+* Add assert when VerilatedContext is mis-deleted (#3121). [Rupert Swarbrick]
 * Fix hang with large case statement optimization (#3405). [Mike Urbach]
 * Fix 'with' operator with type casting (#3387). [xiak95]
 * Fix incorrect conditional merging (#3409). [Raynard Qiao]
diff --git a/bin/verilator b/bin/verilator
index d936ebf56..7d27dad5d 100755
--- a/bin/verilator
+++ b/bin/verilator
@@ -319,6 +319,7 @@ detailed descriptions of these arguments.
      -f <file>                  Parse arguments from a file
      -FI <file>                 Force include of a file
     --flatten                   Force inlining of all modules, tasks and functions
+     -fno-<optimization>        Disable internal optimization stage
      -G<name>=<value>           Overwrite top-level parameter
     --gdb                       Run Verilator under GDB interactively
     --gdbbt                     Run Verilator under GDB for backtrace
@@ -344,7 +345,6 @@ detailed descriptions of these arguments.
     --MMD                       Create .d dependency files
     --MP                        Create phony dependency targets
     --Mdir <directory>          Name of output object directory
-    --no-merge-const-pool       Disable merging of different types in const pool
     --mod-prefix <topname>      Name to prepend to lower classes
     --no-clk <signal-name>      Prevent marking specified signal as clock
     --no-decoration             Disable comments and symbol decorations
@@ -404,7 +404,7 @@ detailed descriptions of these arguments.
     --trace-max-width <width>   Maximum array depth for tracing
     --trace-params              Enable tracing of parameters
     --trace-structs             Enable tracing structure names
-    --trace-threads <threads>   Enable waveform creation on separate threads
+    --trace-threads <threads>   Enable FST waveform creation on separate threads
     --trace-underscore          Enable tracing of _signals
      -U<var>                    Undefine preprocessor define
     --unroll-count <loops>      Tune maximum loop iterations
diff --git a/ci/ci-install.bash b/ci/ci-install.bash
index f258916b4..4f61f06c4 100755
--- a/ci/ci-install.bash
+++ b/ci/ci-install.bash
@@ -54,8 +54,12 @@ if [ "$CI_BUILD_STAGE_NAME" = "build" ]; then
 
   if [ "$CI_OS_NAME" = "linux" ]; then
     sudo apt-get update
-    sudo apt-get install libfl-dev libgoogle-perftools-dev ccache
-    if [ "$CI_RUNS_ON" = "ubuntu-20.04" ]; then
+    sudo apt-get install libfl-dev ccache
+    if [ "$CI_RUNS_ON" != "ubuntu-22.04" ]; then
+      # Some conflict of libunwind version on 22.04, can live without it for now
+      sudo apt-get install libgoogle-perftools-dev
+    fi
+    if [ "$CI_RUNS_ON" = "ubuntu-20.04" ] || [ "$CI_RUNS_ON" = "ubuntu-22.04" ]; then
       sudo apt-get install libsystemc libsystemc-dev
     fi
     if [ "$COVERAGE" = 1 ]; then
@@ -85,7 +89,7 @@ elif [ "$CI_BUILD_STAGE_NAME" = "test" ]; then
     sudo apt-get update
     # libfl-dev needed for internal coverage's test runs
     sudo apt-get install gdb gtkwave lcov libfl-dev ccache
-    if [ "$CI_RUNS_ON" = "ubuntu-20.04" ]; then
+    if [ "$CI_RUNS_ON" = "ubuntu-20.04" ] || [ "$CI_RUNS_ON" = "ubuntu-22.04" ]; then
       sudo apt-get install libsystemc-dev
     fi
     if [ "$CI_M32" = 1 ]; then
diff --git a/configure.ac b/configure.ac
index 2a2b99924..20fade5f0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -348,14 +348,18 @@ AC_SUBST(CFG_CXXFLAGS_PROFILE)
 
 # Flag to select newest language standard supported
 # Macros work such that first option that passes is the one we take
-# Currently enabled c++14 due to packaged SystemC dependency
-# c++14 is the newest that Verilator is regressed to support
+# Currently enable c++17/c++14 due to packaged SystemC dependency
+# c++17 is the newest that Verilator is regularly tested to support
 # c++11 is the oldest that Verilator supports
 # gnu is requried for Cygwin to compile verilated.h successfully
 #_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++20)
 #_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++20)
-#_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++17)
-#_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++17)
+case "$(which lsb_release > /dev/null 2>&1 && lsb_release -d)" in  # which output is discarded
+*Ubuntu*22.04*)  # Only try c++17 on Ubuntu 22.04; other hosts fall through to c++14 below
+_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++17)
+_MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++17)
+;;
+esac
 _MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++14)
 _MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=c++14)
 _MY_CXX_CHECK_SET(CFG_CXXFLAGS_STD_NEWEST,-std=gnu++11)
diff --git a/docs/CONTRIBUTORS b/docs/CONTRIBUTORS
index 8079639e2..d598cebd5 100644
--- a/docs/CONTRIBUTORS
+++ b/docs/CONTRIBUTORS
@@ -35,6 +35,7 @@ Guokai Chen
 Harald Heckmann
 Howard Su
 Huang Rui
+Huanghuang Zhou
 HungMingWu
 HyungKi Jeong
 Iru Cai
diff --git a/docs/guide/deprecations.rst b/docs/guide/deprecations.rst
index 33c2ef610..8c0038453 100644
--- a/docs/guide/deprecations.rst
+++ b/docs/guide/deprecations.rst
@@ -20,6 +20,11 @@ Option `--cdc`
   The experimental `--cdc` option is believed to be generally unused and is
   planned for removal no sooner than January 2023.
 
+Option `-O<letter>`
+  The debug `-O<letter>` options have been replaced with
+  `-fno-<optimization>` debug options to match GCC. The old options are
+  planned for removal no sooner than June 2023.
+
 Option `--prof-threads`
   The `--prof-threads` option has been superseded by the `--prof-exec` and
   `--prof-pgo` options and is planned for removal no sooner than April 2023.
diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst
index 7a8e791f5..8cc56fa89 100644
--- a/docs/guide/exe_verilator.rst
+++ b/docs/guide/exe_verilator.rst
@@ -428,6 +428,52 @@ Summary:
    flattening large designs may require significant CPU time, memory and
    storage.
 
+.. option:: -fno-acyc-simp
+
+.. option:: -fno-assemble
+
+.. option:: -fno-case
+
+.. option:: -fno-combine
+
+.. option:: -fno-const
+
+.. option:: -fno-const-bit-op-tree
+
+.. option:: -fno-dedup
+
+.. option:: -fno-expand
+
+.. option:: -fno-gate
+
+.. option:: -fno-inline
+
+.. option:: -fno-life
+
+.. option:: -fno-life-post
+
+.. option:: -fno-localize
+
+.. option:: -fno-merge-cond
+
+.. option:: -fno-merge-const-pool
+
+.. option:: -fno-reloop
+
+.. option:: -fno-reorder
+
+.. option:: -fno-split
+
+.. option:: -fno-subst
+
+.. option:: -fno-subst-const
+
+.. option:: -fno-table
+
+   Rarely needed. Disables one of the internal optimization steps. These
+   are typically used only when recommended by a maintainer to help debug
+   or work around an issue.
+
 .. option:: -G<name>=<value>
 
    Overwrites the given parameter of the toplevel module. The value is
@@ -645,13 +691,6 @@ Summary:
    The directory is created if it does not exist and the parent directories
    exist; otherwise manually create the Mdir before calling Verilator.
 
-.. option:: --no-merge-const-pool
-
-   Rarely needed.  In order to minimize cache footprint, values of different
-   data type, that are yet emitted identically in C++ are merged in the
-   constant pool.  This option disables this and causes every constant pool
-   entry with a distinct data type to be emitted separately.
-
 .. option:: --mod-prefix <topname>
 
    Specifies the name to prepend to all lower level classes.  Defaults to
@@ -700,9 +739,9 @@ Summary:
 
    Rarely needed.  Enables or disables a specific optimizations, with the
    optimization selected based on the letter passed.  A lowercase letter
-   disables an optimization, an upper case letter enables it.  This is
-   intended for debugging use only; see the source code for
-   version-dependent mappings of optimizations to -O letters.
+   disables an optimization, an upper case letter enables it.  This option
+   is deprecated and the various `-f<optimization>` arguments should be
+   used instead.
 
 .. option:: -o <executable>
 
@@ -1042,7 +1081,8 @@ Summary:
    is not thread safe. With "--threads 1", the generated model is single
    threaded but may run in a multithreaded environment. With "--threads N",
    where N >= 2, the model is generated to run multithreaded on up to N
-   threads. See :ref:`Multithreading`.
+   threads. See :ref:`Multithreading`. This option also applies to
+   :vlopt:`--trace` (but not :vlopt:`--trace-fst`).
 
 .. option:: --threads-dpi all
 
@@ -1120,7 +1160,8 @@ Summary:
    Having tracing compiled in may result in some small performance losses,
    even when tracing is not turned on during model execution.
 
-   See also :vlopt:`--trace-threads` option.
+   When using :vlopt:`--threads`, VCD tracing is parallelized, using the
+   same number of threads as passed to :vlopt:`--threads`.
 
 .. option:: --trace-coverage
 
@@ -1174,12 +1215,12 @@ Summary:
 .. option:: --trace-threads *threads*
 
    Enable waveform tracing using separate threads. This is typically faster
-   in simulation runtime but uses more total compute. This option is
-   independent of, and works with, both :vlopt:`--trace` and
-   :vlopt:`--trace-fst`.  Different trace formats can take advantage of
-   more trace threads to varying degrees. Currently VCD tracing can utilize
-   at most "--trace-threads 1", and FST tracing can utilize at most
-   "--trace-threads 2". This overrides :vlopt:`--no-threads` .
+   in simulation runtime but uses more total compute. This option only
+   applies to :vlopt:`--trace-fst`. FST tracing can utilize at most
+   "--trace-threads 2". This overrides :vlopt:`--no-threads`.
+
+   This option is accepted, but has absolutely no effect with
+   :vlopt:`--trace`, which respects :vlopt:`--threads` instead.
 
 .. option:: --trace-underscore
 
diff --git a/docs/guide/faq.rst b/docs/guide/faq.rst
index 5cc4acd43..0b70ea289 100644
--- a/docs/guide/faq.rst
+++ b/docs/guide/faq.rst
@@ -72,23 +72,38 @@ a good thing for getting working silicon.
 Will Verilator output remain under my own license/copyright?
 """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
 
-Yes, it's just like using GCC on your programs; this is why Verilator uses
-the "GNU **Lesser** Public License Version 3" instead of the more typical
-"GNU Public License".  See the licenses for details, but in brief, if you
-change Verilator itself or the header files Verilator includes, you must
-make the source code available under the GNU Lesser Public License.
-However, Verilator output (the Verilated code) only "include"s the licensed
-files, and so you are **not** required to open-source release any output
-from Verilator.
+Your SystemVerilog, VPI/DPI, or main() C++ code remains under your own license.
+
+It's just like how using GCC on your programs does not change the copyright
+of your program; this is why Verilator uses the "GNU **Lesser** Public
+License Version 3" instead of the more typical "GNU Public License".  See
+the licenses for details.
+
+Some examples:
+
+* Any SystemVerilog or other input fed into Verilator remains your own.
+
+* Any of your VPI/DPI C++ routines that Verilator calls remain your own.
+
+* Any of your main() C++ code that calls into Verilator remains your own.
+
+* If you change Verilator itself, for example changing or adding a file
+  under the src/ directory in the repository, you must make the source code
+  available under the GNU Lesser Public License.
+
+* If you change a header Verilator provides, for example under include/ in
+  the repository, you must make the source code available under the GNU
+  Lesser Public License.
 
 You also have the option of using the Perl Artistic License, which again
-does not require you to release your Verilog or generated code, and also
-allows you to modify Verilator for internal use without distributing the
-modified version.  But please contribute back to the community!
+does not require you to release your Verilog, C++, or generated code. This
+license also allows you to modify Verilator for internal use without
+distributing the modified version.  But please contribute back to the
+community!
 
-One limit is that you cannot under either license release a closed-source
-Verilog simulation product incorporating Verilator. That is you can have a
-commercial product, but must make the source code available.
+Under both licenses you can offer a commercial product that is based on
+Verilator either directly or embedded within.  However under both licenses,
+any changes you make to Verilator for such a product must be open sourced.
 
 As is standard with Open Source, contributions back to Verilator will be
 placed under the Verilator copyright and LGPL/Artistic license.  Small test
diff --git a/docs/guide/verilating.rst b/docs/guide/verilating.rst
index f443ca298..2af18c1f0 100644
--- a/docs/guide/verilating.rst
+++ b/docs/guide/verilating.rst
@@ -221,9 +221,13 @@ model, it may be beneficial to performance to adjust the
 influences the partitioning of the model by adjusting the assumed execution
 time of DPI imports.
 
-The :vlopt:`--trace-threads` options can be used to produce trace dumps
-using multiple threads. If :vlopt:`--trace-threads` is set without
-:vlopt:`--threads`, then :vlopt:`--trace-threads` will imply
+When using :vlopt:`--trace` to perform VCD tracing, the VCD trace
+construction is parallelized using the same number of threads as specified
+with :vlopt:`--threads`, and is executed on the same thread pool as the model.
+
+The :vlopt:`--trace-threads` option can be used with :vlopt:`--trace-fst`
+to offload FST tracing using multiple threads. If :vlopt:`--trace-threads` is
+given without :vlopt:`--threads`, then :vlopt:`--trace-threads` will imply
 :vlopt:`--threads 1 <--threads>`, i.e.: the support libraries will be
 thread safe.
 
@@ -231,12 +235,12 @@ With :vlopt:`--trace-threads 0 <--trace-threads>`, trace dumps are produced
 on the main thread. This again gives the highest single thread performance.
 
 With :vlopt:`--trace-threads {N} <--trace-threads>`, where N is at least 1,
-N additional threads will be created and managed by the trace files (e.g.:
-VerilatedVcdC or VerilatedFstC), to generate the trace dump. The main
-thread will be released to proceed with execution as soon as possible,
-though some blocking of the main thread is still necessary while capturing
-the trace. Different trace formats can utilize a various number of
-threads. See the :vlopt:`--trace-threads` option.
+up to N additional threads will be created and managed by the trace files
+(e.g.: VerilatedFstC), to offload construction of the trace dump. The main
+thread will be released to proceed with execution as soon as possible, though
+some blocking of the main thread is still necessary while capturing the
+trace. FST tracing can utilize up to 2 offload threads, so there is no use
+in setting :vlopt:`--trace-threads` higher than 2 at the moment.
 
 When running a multithreaded model, the default Linux task scheduler often
 works against the model, by assuming threads are short lived, and thus
@@ -441,7 +445,7 @@ SystemC include directories and link to the SystemC libraries.
 
 .. describe:: TRACE_THREADS
 
-   Optional. Generated multi-threaded trace dumping, same as
+   Optional. Generated multi-threaded FST trace dumping, same as
    "--trace-threads".
 
 .. describe:: TOP_MODULE
diff --git a/docs/internals.rst b/docs/internals.rst
index cf6b05d1e..104f18503 100644
--- a/docs/internals.rst
+++ b/docs/internals.rst
@@ -595,7 +595,7 @@ path through the graph is the sum of macro-task execution costs. Sarkar
 does almost the same thing, except that he has nonzero estimates for
 synchronization costs.
 
-Verilator's cost estimates are assigned by ``InstrCountCostVisitor``.  This
+Verilator's cost estimates are assigned by ``InstrCountVisitor``.  This
 class is perhaps the most fragile piece of the multithread
 implementation. It's easy to have a bug where you count something cheap
 (eg. accessing one element of a huge array) as if it were expensive (eg.
diff --git a/docs/spelling.txt b/docs/spelling.txt
index 9014a6af6..0e423ba26 100644
--- a/docs/spelling.txt
+++ b/docs/spelling.txt
@@ -683,6 +683,7 @@ onehot
 ooo
 oprofile
 oversubscription
+parallelized
 param
 parameterized
 params
@@ -771,6 +772,7 @@ specparam
 splitme
 spp
 sqrt
+src
 srcdir
 srcfile
 sscanf
@@ -889,6 +891,7 @@ writeme
 writemem
 writememb
 writememh
+xiak
 xin
 xml
 xnor
diff --git a/examples/cmake_tracing_c/CMakeLists.txt b/examples/cmake_tracing_c/CMakeLists.txt
index 522c20cc5..95fb3dfb2 100644
--- a/examples/cmake_tracing_c/CMakeLists.txt
+++ b/examples/cmake_tracing_c/CMakeLists.txt
@@ -33,5 +33,5 @@ add_executable(example ../make_tracing_c/sim_main.cpp)
 # Add the Verilated circuit to the target
 verilate(example COVERAGE TRACE
   INCLUDE_DIRS "../make_tracing_c"
-  VERILATOR_ARGS -f ../make_tracing_c/input.vc -Os -x-assign 0
+  VERILATOR_ARGS -f ../make_tracing_c/input.vc -x-assign fast
   SOURCES ../make_tracing_c/top.v)
diff --git a/examples/cmake_tracing_sc/CMakeLists.txt b/examples/cmake_tracing_sc/CMakeLists.txt
index 4651d1709..0d67a8cf5 100644
--- a/examples/cmake_tracing_sc/CMakeLists.txt
+++ b/examples/cmake_tracing_sc/CMakeLists.txt
@@ -45,7 +45,7 @@ set_property(
 # Add the Verilated circuit to the target
 verilate(example SYSTEMC COVERAGE TRACE
   INCLUDE_DIRS "../make_tracing_sc"
-  VERILATOR_ARGS -f ../make_tracing_sc/input.vc -Os -x-assign 0
+  VERILATOR_ARGS -f ../make_tracing_sc/input.vc -x-assign fast
   SOURCES ../make_tracing_sc/top.v)
 
 verilator_link_systemc(example)
diff --git a/examples/make_protect_lib/Makefile b/examples/make_protect_lib/Makefile
index 215df0396..359ece33e 100644
--- a/examples/make_protect_lib/Makefile
+++ b/examples/make_protect_lib/Makefile
@@ -33,7 +33,7 @@ VERILATOR_FLAGS =
 # Generate C++
 VERILATOR_FLAGS += -cc
 # Optimize
-VERILATOR_FLAGS += -Os -x-assign 0
+VERILATOR_FLAGS += -x-assign fast
 # Warn abount lint issues; may not want this on less solid designs
 VERILATOR_FLAGS += -Wall
 # This example does not use vl_time_stamp but rather
diff --git a/examples/make_tracing_c/Makefile b/examples/make_tracing_c/Makefile
index be77c71e4..e7dcaf244 100644
--- a/examples/make_tracing_c/Makefile
+++ b/examples/make_tracing_c/Makefile
@@ -36,7 +36,7 @@ VERILATOR_FLAGS += -cc --exe
 # Generate makefile dependencies (not shown as complicates the Makefile)
 #VERILATOR_FLAGS += -MMD
 # Optimize
-VERILATOR_FLAGS += -Os -x-assign 0
+VERILATOR_FLAGS += -x-assign fast
 # Warn abount lint issues; may not want this on less solid designs
 VERILATOR_FLAGS += -Wall
 # Make waveforms
diff --git a/examples/make_tracing_sc/Makefile b/examples/make_tracing_sc/Makefile
index 80a6221b2..5f90a5ebf 100644
--- a/examples/make_tracing_sc/Makefile
+++ b/examples/make_tracing_sc/Makefile
@@ -37,7 +37,7 @@ VERILATOR_FLAGS += -sc --exe
 # Generate makefile dependencies (not shown as complicates the Makefile)
 #VERILATOR_FLAGS += -MMD
 # Optimize
-VERILATOR_FLAGS += -Os -x-assign 0
+VERILATOR_FLAGS += -x-assign fast
 # Warn abount lint issues; may not want this on less solid designs
 VERILATOR_FLAGS += -Wall
 # Make waveforms
diff --git a/include/verilated.h b/include/verilated.h
index 804d7363a..f9cf79601 100644
--- a/include/verilated.h
+++ b/include/verilated.h
@@ -147,7 +147,7 @@ extern uint32_t VL_THREAD_ID() VL_MT_SAFE;
 
 #if VL_THREADED
 
-#define VL_LOCK_SPINS 50000  /// Number of times to spin for a mutex before relaxing
+#define VL_LOCK_SPINS 50000  /// Number of times to spin for a mutex before yielding
 
 /// Mutex, wrapped to allow -fthread_safety checks
 class VL_CAPABILITY("mutex") VerilatedMutex final {
diff --git a/include/verilated_fst_c.cpp b/include/verilated_fst_c.cpp
index 68431db71..0bc1048cf 100644
--- a/include/verilated_fst_c.cpp
+++ b/include/verilated_fst_c.cpp
@@ -83,9 +83,11 @@ static_assert(static_cast<int>(FST_ST_VCD_PROGRAM) == static_cast<int>(VLT_TRACE
 //=============================================================================
 // Specialization of the generics for this trace format
 
-#define VL_DERIVED_T VerilatedFst
-#include "verilated_trace_imp.cpp"
-#undef VL_DERIVED_T
+#define VL_SUB_T VerilatedFst
+#define VL_BUF_T VerilatedFstBuffer
+#include "verilated_trace_imp.h"
+#undef VL_SUB_T
+#undef VL_BUF_T
 
 //=============================================================================
 // VerilatedFst
@@ -111,7 +113,7 @@ void VerilatedFst::open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex) {
 
     m_curScope.clear();
 
-    VerilatedTrace<VerilatedFst>::traceInit();
+    Super::traceInit();
 
     // Clear the scope stack
     auto it = m_curScope.begin();
@@ -133,14 +135,14 @@ void VerilatedFst::open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex) {
 
 void VerilatedFst::close() VL_MT_SAFE_EXCLUDES(m_mutex) {
     const VerilatedLockGuard lock{m_mutex};
-    VerilatedTrace<VerilatedFst>::closeBase();
+    Super::closeBase();
     fstWriterClose(m_fst);
     m_fst = nullptr;
 }
 
 void VerilatedFst::flush() VL_MT_SAFE_EXCLUDES(m_mutex) {
     const VerilatedLockGuard lock{m_mutex};
-    VerilatedTrace<VerilatedFst>::flushBase();
+    Super::flushBase();
     fstWriterFlushContext(m_fst);
 }
 
@@ -162,7 +164,7 @@ void VerilatedFst::declare(uint32_t code, const char* name, int dtypenum, fstVar
                            int lsb) {
     const int bits = ((msb > lsb) ? (msb - lsb) : (lsb - msb)) + 1;
 
-    const bool enabled = VerilatedTrace<VerilatedFst>::declCode(code, name, bits, false);
+    const bool enabled = Super::declCode(code, name, bits, false);
     if (!enabled) return;
 
     std::string nameasstr = namePrefix() + name;
@@ -245,18 +247,42 @@ void VerilatedFst::declDouble(uint32_t code, const char* name, int dtypenum, fst
     declare(code, name, dtypenum, vardir, vartype, array, arraynum, false, 63, 0);
 }
 
+//=============================================================================
+// Get/commit trace buffer
+
+VerilatedFstBuffer* VerilatedFst::getTraceBuffer() { return new VerilatedFstBuffer{*this}; }
+
+void VerilatedFst::commitTraceBuffer(VerilatedFstBuffer* bufp) {
+#ifdef VL_TRACE_OFFLOAD
+    if (bufp->m_offloadBufferWritep) {  // Buffer carries an offload write pointer
+        m_offloadBufferWritep = bufp->m_offloadBufferWritep;  // Copy it back into the trace
+        return;  // Buffer will be deleted by the offload thread
+    }
+#endif
+    delete bufp;  // Otherwise free it here (presumably allocated by getTraceBuffer())
+}
+
+//=============================================================================
+// VerilatedFstBuffer implementation
+
+VerilatedFstBuffer::VerilatedFstBuffer(VerilatedFst& owner)
+    : VerilatedTraceBuffer<VerilatedFst, VerilatedFstBuffer>{owner} {}
+
+//=============================================================================
+// Trace rendering primitives
+
 // Note: emit* are only ever called from one place (full* in
-// verilated_trace_imp.cpp, which is included in this file at the top),
+// verilated_trace_imp.h, which is included in this file at the top),
 // so always inline them.
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitBit(uint32_t code, CData newval) {
+void VerilatedFstBuffer::emitBit(uint32_t code, CData newval) {
     VL_DEBUG_IFDEF(assert(m_symbolp[code]););
     fstWriterEmitValueChange(m_fst, m_symbolp[code], newval ? "1" : "0");
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitCData(uint32_t code, CData newval, int bits) {
+void VerilatedFstBuffer::emitCData(uint32_t code, CData newval, int bits) {
     char buf[VL_BYTESIZE];
     VL_DEBUG_IFDEF(assert(m_symbolp[code]););
     cvtCDataToStr(buf, newval << (VL_BYTESIZE - bits));
@@ -264,7 +290,7 @@ void VerilatedFst::emitCData(uint32_t code, CData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitSData(uint32_t code, SData newval, int bits) {
+void VerilatedFstBuffer::emitSData(uint32_t code, SData newval, int bits) {
     char buf[VL_SHORTSIZE];
     VL_DEBUG_IFDEF(assert(m_symbolp[code]););
     cvtSDataToStr(buf, newval << (VL_SHORTSIZE - bits));
@@ -272,7 +298,7 @@ void VerilatedFst::emitSData(uint32_t code, SData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitIData(uint32_t code, IData newval, int bits) {
+void VerilatedFstBuffer::emitIData(uint32_t code, IData newval, int bits) {
     char buf[VL_IDATASIZE];
     VL_DEBUG_IFDEF(assert(m_symbolp[code]););
     cvtIDataToStr(buf, newval << (VL_IDATASIZE - bits));
@@ -280,7 +306,7 @@ void VerilatedFst::emitIData(uint32_t code, IData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitQData(uint32_t code, QData newval, int bits) {
+void VerilatedFstBuffer::emitQData(uint32_t code, QData newval, int bits) {
     char buf[VL_QUADSIZE];
     VL_DEBUG_IFDEF(assert(m_symbolp[code]););
     cvtQDataToStr(buf, newval << (VL_QUADSIZE - bits));
@@ -288,7 +314,7 @@ void VerilatedFst::emitQData(uint32_t code, QData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitWData(uint32_t code, const WData* newvalp, int bits) {
+void VerilatedFstBuffer::emitWData(uint32_t code, const WData* newvalp, int bits) {
     int words = VL_WORDS_I(bits);
     char* wp = m_strbuf;
     // Convert the most significant word
@@ -304,6 +330,6 @@ void VerilatedFst::emitWData(uint32_t code, const WData* newvalp, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedFst::emitDouble(uint32_t code, double newval) {
+void VerilatedFstBuffer::emitDouble(uint32_t code, double newval) {
     fstWriterEmitValueChange(m_fst, m_symbolp[code], &newval);
 }
diff --git a/include/verilated_fst_c.h b/include/verilated_fst_c.h
index b622a1894..5131cc8cc 100644
--- a/include/verilated_fst_c.h
+++ b/include/verilated_fst_c.h
@@ -31,15 +31,19 @@
 #include <string>
 #include <vector>
 
+class VerilatedFstBuffer;
+
 //=============================================================================
 // VerilatedFst
 // Base class to create a Verilator FST dump
 // This is an internally used class - see VerilatedFstC for what to call from applications
 
-class VerilatedFst final : public VerilatedTrace<VerilatedFst> {
+class VerilatedFst final : public VerilatedTrace<VerilatedFst, VerilatedFstBuffer> {
+public:
+    using Super = VerilatedTrace<VerilatedFst, VerilatedFstBuffer>;
+
 private:
-    // Give the superclass access to private bits (to avoid virtual functions)
-    friend class VerilatedTrace<VerilatedFst>;
+    friend Buffer;  // Give the buffer access to the private bits
 
     //=========================================================================
     // FST specific internals
@@ -60,31 +64,26 @@ protected:
     //=========================================================================
     // Implementation of VerilatedTrace interface
 
-    // Implementations of protected virtual methods for VerilatedTrace
+    // Called when the trace moves forward to a new time point
     virtual void emitTimeChange(uint64_t timeui) override;
 
     // Hooks called from VerilatedTrace
     virtual bool preFullDump() override { return isOpen(); }
     virtual bool preChangeDump() override { return isOpen(); }
 
-    // Implementations of duck-typed methods for VerilatedTrace. These are
-    // called from only one place (namely full*) so always inline them.
-    inline void emitBit(uint32_t code, CData newval);
-    inline void emitCData(uint32_t code, CData newval, int bits);
-    inline void emitSData(uint32_t code, SData newval, int bits);
-    inline void emitIData(uint32_t code, IData newval, int bits);
-    inline void emitQData(uint32_t code, QData newval, int bits);
-    inline void emitWData(uint32_t code, const WData* newvalp, int bits);
-    inline void emitDouble(uint32_t code, double newval);
+    // Trace buffer management
+    virtual VerilatedFstBuffer* getTraceBuffer() override;
+    virtual void commitTraceBuffer(VerilatedFstBuffer*) override;
 
 public:
     //=========================================================================
     // External interface to client code
-    // (All must be threadsafe)
 
+    // CONSTRUCTOR
     explicit VerilatedFst(void* fst = nullptr);
     ~VerilatedFst();
 
+    // METHODS - All must be thread safe
     // Open the file; call isOpen() to see if errors
     void open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex);
     // Close the file
@@ -97,11 +96,6 @@ public:
     //=========================================================================
     // Internal interface to Verilator generated code
 
-    // Inside dumping routines, declare a data type
-    void declDTypeEnum(int dtypenum, const char* name, uint32_t elements, unsigned int minValbits,
-                       const char** itemNamesp, const char** itemValuesp);
-
-    // Inside dumping routines, declare a signal
     void declBit(uint32_t code, const char* name, int dtypenum, fstVarDir vardir,
                  fstVarType vartype, bool array, int arraynum);
     void declBus(uint32_t code, const char* name, int dtypenum, fstVarDir vardir,
@@ -112,18 +106,55 @@ public:
                    fstVarType vartype, bool array, int arraynum, int msb, int lsb);
     void declDouble(uint32_t code, const char* name, int dtypenum, fstVarDir vardir,
                     fstVarType vartype, bool array, int arraynum);
+
+    void declDTypeEnum(int dtypenum, const char* name, uint32_t elements, unsigned int minValbits,
+                       const char** itemNamesp, const char** itemValuesp);
 };
 
 #ifndef DOXYGEN
 // Declare specialization here as it's used in VerilatedFstC just below
-template <> void VerilatedTrace<VerilatedFst>::dump(uint64_t timeui);
-template <> void VerilatedTrace<VerilatedFst>::set_time_unit(const char* unitp);
-template <> void VerilatedTrace<VerilatedFst>::set_time_unit(const std::string& unit);
-template <> void VerilatedTrace<VerilatedFst>::set_time_resolution(const char* unitp);
-template <> void VerilatedTrace<VerilatedFst>::set_time_resolution(const std::string& unit);
-template <> void VerilatedTrace<VerilatedFst>::dumpvars(int level, const std::string& hier);
+template <> void VerilatedFst::Super::dump(uint64_t time);
+template <> void VerilatedFst::Super::set_time_unit(const char* unitp);
+template <> void VerilatedFst::Super::set_time_unit(const std::string& unit);
+template <> void VerilatedFst::Super::set_time_resolution(const char* unitp);
+template <> void VerilatedFst::Super::set_time_resolution(const std::string& unit);
+template <> void VerilatedFst::Super::dumpvars(int level, const std::string& hier);
 #endif
 
+//=============================================================================
+// VerilatedFstBuffer
+
+class VerilatedFstBuffer final : public VerilatedTraceBuffer<VerilatedFst, VerilatedFstBuffer> {
+    // Give the trace file access to the private bits
+    friend VerilatedFst;
+    friend VerilatedFst::Super;
+
+    // The FST file handle
+    void* const m_fst = m_owner.m_fst;
+    // Code to fstHandle map, as an array
+    const fstHandle* const m_symbolp = m_owner.m_symbolp;
+    // String buffer long enough to hold maxBits() chars
+    char* const m_strbuf = m_owner.m_strbuf;
+
+public:
+    // CONSTRUCTOR
+    explicit VerilatedFstBuffer(VerilatedFst& owner);
+    ~VerilatedFstBuffer() = default;
+
+    //=========================================================================
+    // Implementation of VerilatedTraceBuffer interface
+
+    // Implementations of duck-typed methods for VerilatedTraceBuffer. These are
+    // called from only one place (the full* methods), so always inline them.
+    VL_ATTR_ALWINLINE inline void emitBit(uint32_t code, CData newval);
+    VL_ATTR_ALWINLINE inline void emitCData(uint32_t code, CData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitSData(uint32_t code, SData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitIData(uint32_t code, IData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitQData(uint32_t code, QData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitWData(uint32_t code, const WData* newvalp, int bits);
+    VL_ATTR_ALWINLINE inline void emitDouble(uint32_t code, double newval);
+};
+
 //=============================================================================
 // VerilatedFstC
 /// Create a FST dump file in C standalone (no SystemC) simulations.
diff --git a/include/verilated_profiler.cpp b/include/verilated_profiler.cpp
index 1a5f16a36..ed25093d1 100644
--- a/include/verilated_profiler.cpp
+++ b/include/verilated_profiler.cpp
@@ -60,7 +60,7 @@ uint16_t VlExecutionRecord::getcpu() {
 //=============================================================================
 // VlExecutionProfiler implementation
 
-template <size_t N> size_t roundUptoMultipleOf(size_t value) {
+template <size_t N> static size_t roundUptoMultipleOf(size_t value) {
     static_assert((N & (N - 1)) == 0, "'N' must be a power of 2");
     size_t mask = N - 1;
     return (value + mask) & ~mask;
diff --git a/include/verilated_trace.h b/include/verilated_trace.h
index a88ce6b50..7915c3645 100644
--- a/include/verilated_trace.h
+++ b/include/verilated_trace.h
@@ -22,28 +22,43 @@
 #ifndef VERILATOR_VERILATED_TRACE_H_
 #define VERILATOR_VERILATED_TRACE_H_
 
-#ifdef VL_TRACE_THREADED
-#define VL_TRACE_OFFLOAD
+// clang-format off
+
+// In FST mode, VL_TRACE_THREADED enables offloading, but only if we also have
+// the FST writer thread. This means with --trace-threads 1, we get the FST
+// writer thread only, and with --trace-threads 2 we get offloading as well
+#if defined(VL_TRACE_FST_WRITER_THREAD) && defined(VL_TRACE_THREADED)
+# define VL_TRACE_OFFLOAD
+#endif
+// VCD tracing can happen fully in parallel
+#if defined(VM_TRACE_VCD) && VM_TRACE_VCD && defined(VL_TRACE_THREADED)
+# define VL_TRACE_PARALLEL
 #endif
 
-// clang-format off
+#if defined(VL_TRACE_PARALLEL) && defined(VL_TRACE_OFFLOAD)
+# error "Cannot have VL_TRACE_PARALLEL and VL_TRACE_OFFLOAD together"
+#endif
 
 #include "verilated.h"
 #include "verilated_trace_defs.h"
 
 #include <bitset>
+#include <condition_variable>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #ifdef VL_TRACE_OFFLOAD
-# include <condition_variable>
 # include <deque>
 # include <thread>
 #endif
 
 // clang-format on
 
+class VlThreadPool;
+template <class T_Trace, class T_Buffer> class VerilatedTraceBuffer;
+
 #ifdef VL_TRACE_OFFLOAD
 //=============================================================================
 // Offloaded tracing
@@ -106,7 +121,8 @@ public:
         CHG_WDATA = 0x6,
         CHG_DOUBLE = 0x8,
         // TODO: full..
-        TIME_CHANGE = 0xd,
+        TIME_CHANGE = 0xc,
+        TRACE_BUFFER = 0xd,
         END = 0xe,  // End of buffer
         SHUTDOWN = 0xf  // Shutdown worker thread, also marks end of buffer
     };
@@ -116,16 +132,22 @@ public:
 //=============================================================================
 // VerilatedTrace
 
-// VerilatedTrace uses F-bounded polymorphism to access duck-typed
-// implementations in the format specific derived class, which must be passed
-// as the type parameter T_Derived
-template <class T_Derived> class VerilatedTrace VL_NOT_FINAL {
+// T_Trace is the format specific subclass of VerilatedTrace.
+// T_Buffer is the format specific subclass of VerilatedTraceBuffer.
+template <class T_Trace, class T_Buffer> class VerilatedTrace VL_NOT_FINAL {
+    // Give the buffer (both base and derived) access to the private bits
+    friend VerilatedTraceBuffer<T_Trace, T_Buffer>;
+    friend T_Buffer;
+
 public:
+    using Buffer = T_Buffer;
+
     //=========================================================================
     // Generic tracing internals
 
-    using initCb_t = void (*)(void*, T_Derived*, uint32_t);  // Type of init callbacks
-    using dumpCb_t = void (*)(void*, T_Derived*);  // Type of all but init callbacks
+    using initCb_t = void (*)(void*, T_Trace*, uint32_t);  // Type of init callbacks
+    using dumpCb_t = void (*)(void*, Buffer*);  // Type of dump callbacks
+    using cleanupCb_t = void (*)(void*, T_Trace*);  // Type of cleanup callbacks
 
 private:
     struct CallbackRecord {
@@ -133,9 +155,10 @@ private:
         // (the one in Ubuntu 14.04 with GCC 4.8.4 in particular) use the
         // assignment operator on inserting into collections, so they don't work
         // with const fields...
-        union {
-            initCb_t m_initCb;  // The callback function
-            dumpCb_t m_dumpCb;  // The callback function
+        union {  // The callback
+            initCb_t m_initCb;
+            dumpCb_t m_dumpCb;
+            cleanupCb_t m_cleanupCb;
         };
         void* m_userp;  // The user pointer to pass to the callback (the symbol table)
         CallbackRecord(initCb_t cb, void* userp)
@@ -144,32 +167,66 @@ private:
         CallbackRecord(dumpCb_t cb, void* userp)
             : m_dumpCb{cb}
             , m_userp{userp} {}
+        CallbackRecord(cleanupCb_t cb, void* userp)
+            : m_cleanupCb{cb}
+            , m_userp{userp} {}
     };
 
-    uint32_t* m_sigs_oldvalp;  // Old value store
-    EData* m_sigs_enabledp;  // Bit vector of enabled codes (nullptr = all on)
-    uint64_t m_timeLastDump;  // Last time we did a dump
+#ifdef VL_TRACE_PARALLEL
+    struct ParallelWorkerData {
+        const dumpCb_t m_cb;  // The callback
+        void* const m_userp;  // The user pointer to pass to the callback
+        Buffer* const m_bufp;  // The buffer pointer to pass to the callback
+        std::atomic<bool> m_ready{false};  // The ready flag
+        mutable VerilatedMutex m_mutex;  // Mutex for suspension until ready
+        std::condition_variable_any m_cv;  // Condition variable for suspension
+        bool m_waiting VL_GUARDED_BY(m_mutex) = false;  // Whether a thread is suspended in wait()
+
+        void wait();
+
+        ParallelWorkerData(dumpCb_t cb, void* userp, Buffer* bufp)
+            : m_cb{cb}
+            , m_userp{userp}
+            , m_bufp{bufp} {}
+    };
+
+    // Passed a ParallelWorkerData*, second argument is ignored
+    static void parallelWorkerTask(void*, bool);
+#endif
+
+    using ParallelCallbackMap = std::unordered_map<VlThreadPool*, std::vector<CallbackRecord>>;
+
+protected:
+    uint32_t* m_sigs_oldvalp = nullptr;  // Previous value store
+    EData* m_sigs_enabledp = nullptr;  // Bit vector of enabled codes (nullptr = all on)
+private:
+    uint64_t m_timeLastDump = 0;  // Last time we did a dump
     std::vector<bool> m_sigs_enabledVec;  // Staging for m_sigs_enabledp
-    std::vector<CallbackRecord> m_initCbs;  // Routines to initialize traciong
-    std::vector<CallbackRecord> m_fullCbs;  // Routines to perform full dump
-    std::vector<CallbackRecord> m_chgCbs;  // Routines to perform incremental dump
+    std::vector<CallbackRecord> m_initCbs;  // Routines to initialize tracing
+    ParallelCallbackMap m_fullCbs;  // Routines to perform full dump
+    ParallelCallbackMap m_chgCbs;  // Routines to perform incremental dump
     std::vector<CallbackRecord> m_cleanupCbs;  // Routines to call at the end of dump
-    bool m_fullDump;  // Whether a full dump is required on the next call to 'dump'
-    uint32_t m_nextCode;  // Next code number to assign
-    uint32_t m_numSignals;  // Number of distinct signals
-    uint32_t m_maxBits;  // Number of bits in the widest signal
+    std::vector<VlThreadPool*> m_threadPoolps;  // All thread pools, in insertion order
+    bool m_fullDump = true;  // Whether a full dump is required on the next call to 'dump'
+    uint32_t m_nextCode = 0;  // Next code number to assign
+    uint32_t m_numSignals = 0;  // Number of distinct signals
+    uint32_t m_maxBits = 0;  // Number of bits in the widest signal
     std::vector<std::string> m_namePrefixStack{""};  // Path prefixes to add to signal names
     std::vector<std::pair<int, std::string>> m_dumpvars;  // dumpvar() entries
-    char m_scopeEscape;
-    double m_timeRes;  // Time resolution (ns/ms etc)
-    double m_timeUnit;  // Time units (ns/ms etc)
+    char m_scopeEscape = '.';
+    double m_timeRes = 1e-9;  // Time resolution (ns/ms etc)
+    double m_timeUnit = 1e-0;  // Time units (ns/ms etc)
+
+    void addThreadPool(VlThreadPool* threadPoolp) VL_MT_SAFE_EXCLUDES(m_mutex);
 
     void addCallbackRecord(std::vector<CallbackRecord>& cbVec, CallbackRecord& cbRec)
         VL_MT_SAFE_EXCLUDES(m_mutex);
 
-    // Equivalent to 'this' but is of the sub-type 'T_Derived*'. Use 'self()->'
+    // Equivalent to 'this' but is of the sub-type 'T_Trace*'. Use 'self()->'
     // to access duck-typed functions to avoid a virtual function call.
-    T_Derived* self() { return static_cast<T_Derived*>(this); }
+    T_Trace* self() { return static_cast<T_Trace*>(this); }
+
+    void runParallelCallbacks(const ParallelCallbackMap& cbMap);
 
     // Flush any remaining data for this file
     static void onFlush(void* selfp) VL_MT_UNSAFE_ONE;
@@ -178,17 +235,21 @@ private:
 
 #ifdef VL_TRACE_OFFLOAD
     // Number of total offload buffers that have been allocated
-    uint32_t m_numOffloadBuffers;
+    uint32_t m_numOffloadBuffers = 0;
     // Size of offload buffers
-    size_t m_offloadBufferSize;
+    size_t m_offloadBufferSize = 0;
     // Buffers handed to worker for processing
     VerilatedThreadQueue<uint32_t*> m_offloadBuffersToWorker;
     // Buffers returned from worker after processing
     VerilatedThreadQueue<uint32_t*> m_offloadBuffersFromWorker;
+
+protected:
     // Write pointer into current buffer
-    uint32_t* m_offloadBufferWritep;
+    uint32_t* m_offloadBufferWritep = nullptr;
     // End of offload buffer
-    uint32_t* m_offloadBufferEndp;
+    uint32_t* m_offloadBufferEndp = nullptr;
+
+private:
     // The offload worker thread itself
     std::unique_ptr<std::thread> m_workerThread;
 
@@ -250,6 +311,10 @@ protected:
     virtual bool preFullDump() = 0;
     virtual bool preChangeDump() = 0;
 
+    // Trace buffer management
+    virtual Buffer* getTraceBuffer() = 0;
+    virtual void commitTraceBuffer(Buffer*) = 0;
+
 public:
     //=========================================================================
     // External interface to client code
@@ -270,19 +335,55 @@ public:
     // Call
     void dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex);
 
+    //=========================================================================
+    // Internal interface to Verilator generated code
+
     //=========================================================================
     // Non-hot path internal interface to Verilator generated code
 
     void addInitCb(initCb_t cb, void* userp) VL_MT_SAFE;
-    void addFullCb(dumpCb_t cb, void* userp) VL_MT_SAFE;
-    void addChgCb(dumpCb_t cb, void* userp) VL_MT_SAFE;
-    void addCleanupCb(dumpCb_t cb, void* userp) VL_MT_SAFE;
+    void addFullCb(dumpCb_t cb, void* userp, VlThreadPool* = nullptr) VL_MT_SAFE;
+    void addChgCb(dumpCb_t cb, void* userp, VlThreadPool* = nullptr) VL_MT_SAFE;
+    void addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE;
 
     void scopeEscape(char flag) { m_scopeEscape = flag; }
 
     void pushNamePrefix(const std::string&);
     void popNamePrefix(unsigned count = 1);
+};
 
+//=============================================================================
+// VerilatedTraceBuffer
+
+// T_Trace is the format specific subclass of VerilatedTrace.
+// T_Buffer is the format specific subclass of VerilatedTraceBuffer.
+// The format-specific hot-path methods use duck-typing via T_Buffer for performance.
+template <class T_Trace, class T_Buffer> class VerilatedTraceBuffer VL_NOT_FINAL {
+    friend T_Trace;  // Give the trace file access to the private bits
+
+protected:
+    T_Trace& m_owner;  // The VerilatedTrace subclass that owns this buffer
+
+    // Previous value store
+    uint32_t* const m_sigs_oldvalp = m_owner.m_sigs_oldvalp;
+    // Bit vector of enabled codes (nullptr = all on)
+    EData* const m_sigs_enabledp = m_owner.m_sigs_enabledp;
+
+#ifdef VL_TRACE_OFFLOAD
+    // Write pointer into current buffer
+    uint32_t* m_offloadBufferWritep = m_owner.m_offloadBufferWritep;
+    // End of offload buffer
+    uint32_t* const m_offloadBufferEndp = m_owner.m_offloadBufferEndp;
+#endif
+
+    // Equivalent to 'this' but is of the sub-type 'T_Buffer*'. Use 'self()->'
+    // to access duck-typed functions to avoid a virtual function call.
+    inline T_Buffer* self() { return static_cast<T_Buffer*>(this); }
+
+    explicit VerilatedTraceBuffer(T_Trace& owner);
+    virtual ~VerilatedTraceBuffer() = default;
+
+public:
     //=========================================================================
     // Hot path internal interface to Verilator generated code
 
@@ -300,7 +401,7 @@ public:
     // duck-typed void emitWData(uint32_t code, const WData* newvalp, int bits) = 0;
     // duck-typed void emitDouble(uint32_t code, double newval) = 0;
 
-    uint32_t* oldp(uint32_t code) { return m_sigs_oldvalp + code; }
+    VL_ATTR_ALWINLINE inline uint32_t* oldp(uint32_t code) { return m_sigs_oldvalp + code; }
 
     // Write to previous value buffer value and emit trace entry.
     void fullBit(uint32_t* oldp, CData newval);
@@ -363,9 +464,13 @@ public:
         VL_DEBUG_IF(assert(m_offloadBufferWritep <= m_offloadBufferEndp););
     }
 
-#define CHG(name) chg##name##Impl
-#else
-#define CHG(name) chg##name
+#define chgBit chgBitImpl
+#define chgCData chgCDataImpl
+#define chgSData chgSDataImpl
+#define chgIData chgIDataImpl
+#define chgQData chgQDataImpl
+#define chgWData chgWDataImpl
+#define chgDouble chgDoubleImpl
 #endif
 
     // In non-offload mode, these are called directly by the trace callbacks,
@@ -373,27 +478,27 @@ public:
     // thread and are called chg*Impl
 
     // Check previous dumped value of signal. If changed, then emit trace entry
-    inline void CHG(Bit)(uint32_t* oldp, CData newval) {
+    VL_ATTR_ALWINLINE inline void chgBit(uint32_t* oldp, CData newval) {
         const uint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullBit(oldp, newval);
     }
-    inline void CHG(CData)(uint32_t* oldp, CData newval, int bits) {
+    VL_ATTR_ALWINLINE inline void chgCData(uint32_t* oldp, CData newval, int bits) {
         const uint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullCData(oldp, newval, bits);
     }
-    inline void CHG(SData)(uint32_t* oldp, SData newval, int bits) {
+    VL_ATTR_ALWINLINE inline void chgSData(uint32_t* oldp, SData newval, int bits) {
         const uint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullSData(oldp, newval, bits);
     }
-    inline void CHG(IData)(uint32_t* oldp, IData newval, int bits) {
+    VL_ATTR_ALWINLINE inline void chgIData(uint32_t* oldp, IData newval, int bits) {
         const uint32_t diff = *oldp ^ newval;
         if (VL_UNLIKELY(diff)) fullIData(oldp, newval, bits);
     }
-    inline void CHG(QData)(uint32_t* oldp, QData newval, int bits) {
+    VL_ATTR_ALWINLINE inline void chgQData(uint32_t* oldp, QData newval, int bits) {
         const uint64_t diff = *reinterpret_cast<QData*>(oldp) ^ newval;
         if (VL_UNLIKELY(diff)) fullQData(oldp, newval, bits);
     }
-    inline void CHG(WData)(uint32_t* oldp, const WData* newvalp, int bits) {
+    VL_ATTR_ALWINLINE inline void chgWData(uint32_t* oldp, const WData* newvalp, int bits) {
         for (int i = 0; i < (bits + 31) / 32; ++i) {
             if (VL_UNLIKELY(oldp[i] ^ newvalp[i])) {
                 fullWData(oldp, newvalp, bits);
@@ -401,11 +506,20 @@ public:
             }
         }
     }
-    inline void CHG(Double)(uint32_t* oldp, double newval) {
+    VL_ATTR_ALWINLINE inline void chgDouble(uint32_t* oldp, double newval) {
         // cppcheck-suppress invalidPointerCast
         if (VL_UNLIKELY(*reinterpret_cast<double*>(oldp) != newval)) fullDouble(oldp, newval);
     }
 
-#undef CHG
+#ifdef VL_TRACE_OFFLOAD
+#undef chgBit
+#undef chgCData
+#undef chgSData
+#undef chgIData
+#undef chgQData
+#undef chgWData
+#undef chgDouble
+#endif
 };
+
 #endif  // guard
diff --git a/include/verilated_trace_imp.cpp b/include/verilated_trace_imp.h
similarity index 71%
rename from include/verilated_trace_imp.cpp
rename to include/verilated_trace_imp.h
index 7a98b7abf..d2ffa965c 100644
--- a/include/verilated_trace_imp.cpp
+++ b/include/verilated_trace_imp.h
@@ -10,26 +10,26 @@
 // SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
 //
 //=============================================================================
-///
-/// \file
-/// \brief Verilated common-format tracing implementation code
-///
-/// This file must be compiled and linked against all Verilated objects
-/// that use --trace.
-///
-/// Use "verilator --trace" to add this to the Makefile for the linker.
-///
+//
+// Verilated tracing implementation code template common to all formats.
+// This file is included by the format specific implementations and
+// should not be used otherwise.
+//
 //=============================================================================
 
 // clang-format off
 
 #ifndef VL_CPPCHECK
-#ifndef VL_DERIVED_T
+#if !defined(VL_SUB_T) || !defined(VL_BUF_T)
 # error "This file should be included in trace format implementations"
 #endif
 
 #include "verilated_intrinsics.h"
 #include "verilated_trace.h"
+#ifdef VL_TRACE_PARALLEL
+# include "verilated_threads.h"
+# include <list>
+#endif
 
 #if 0
 # include <iostream>
@@ -82,7 +82,7 @@ static std::string doubleToTimescale(double value) {
 //=========================================================================
 // Buffer management
 
-template <> uint32_t* VerilatedTrace<VL_DERIVED_T>::getOffloadBuffer() {
+template <> uint32_t* VerilatedTrace<VL_SUB_T, VL_BUF_T>::getOffloadBuffer() {
     uint32_t* bufferp;
     // Some jitter is expected, so some number of alternative offlaod buffers are
     // required, but don't allocate more than 8 buffers.
@@ -101,7 +101,7 @@ template <> uint32_t* VerilatedTrace<VL_DERIVED_T>::getOffloadBuffer() {
     return bufferp;
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::waitForOffloadBuffer(const uint32_t* buffp) {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::waitForOffloadBuffer(const uint32_t* buffp) {
     // Slow path code only called on flush/shutdown, so use a simple algorithm.
     // Collect buffers from worker and stash them until we get the one we want.
     std::deque<uint32_t*> stash;
@@ -116,7 +116,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::waitForOffloadBuffer(const uint32
 //=========================================================================
 // Worker thread
 
-template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::offloadWorkerThreadMain() {
     bool shutdown = false;
 
     do {
@@ -127,6 +127,8 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
 
         const uint32_t* readp = bufferp;
 
+        std::unique_ptr<VL_BUF_T> traceBufp;  // We own the passed tracebuffer
+
         while (true) {
             const uint32_t cmd = readp[0];
             const uint32_t top = cmd >> 4;
@@ -141,44 +143,44 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
                 // CHG_* commands
             case VerilatedTraceOffloadCommand::CHG_BIT_0:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_BIT_0 " << top);
-                chgBitImpl(oldp, 0);
+                traceBufp->chgBitImpl(oldp, 0);
                 continue;
             case VerilatedTraceOffloadCommand::CHG_BIT_1:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_BIT_1 " << top);
-                chgBitImpl(oldp, 1);
+                traceBufp->chgBitImpl(oldp, 1);
                 continue;
             case VerilatedTraceOffloadCommand::CHG_CDATA:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_CDATA " << top);
                 // Bits stored in bottom byte of command
-                chgCDataImpl(oldp, *readp, top);
+                traceBufp->chgCDataImpl(oldp, *readp, top);
                 readp += 1;
                 continue;
             case VerilatedTraceOffloadCommand::CHG_SDATA:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_SDATA " << top);
                 // Bits stored in bottom byte of command
-                chgSDataImpl(oldp, *readp, top);
+                traceBufp->chgSDataImpl(oldp, *readp, top);
                 readp += 1;
                 continue;
             case VerilatedTraceOffloadCommand::CHG_IDATA:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_IDATA " << top);
                 // Bits stored in bottom byte of command
-                chgIDataImpl(oldp, *readp, top);
+                traceBufp->chgIDataImpl(oldp, *readp, top);
                 readp += 1;
                 continue;
             case VerilatedTraceOffloadCommand::CHG_QDATA:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_QDATA " << top);
                 // Bits stored in bottom byte of command
-                chgQDataImpl(oldp, *reinterpret_cast<const QData*>(readp), top);
+                traceBufp->chgQDataImpl(oldp, *reinterpret_cast<const QData*>(readp), top);
                 readp += 2;
                 continue;
             case VerilatedTraceOffloadCommand::CHG_WDATA:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_WDATA " << top);
-                chgWDataImpl(oldp, readp, top);
+                traceBufp->chgWDataImpl(oldp, readp, top);
                 readp += VL_WORDS_I(top);
                 continue;
             case VerilatedTraceOffloadCommand::CHG_DOUBLE:
                 VL_TRACE_OFFLOAD_DEBUG("Command CHG_DOUBLE " << top);
-                chgDoubleImpl(oldp, *reinterpret_cast<const double*>(readp));
+                traceBufp->chgDoubleImpl(oldp, *reinterpret_cast<const double*>(readp));
                 readp += 2;
                 continue;
 
@@ -191,9 +193,18 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
                 readp += 2;
                 continue;
 
+            case VerilatedTraceOffloadCommand::TRACE_BUFFER:
+                VL_TRACE_OFFLOAD_DEBUG("Command TRACE_BUFFER " << top);
+                readp -= 1;  // No code in this command, undo increment
+                traceBufp.reset(*reinterpret_cast<VL_BUF_T* const*>(readp));
+                readp += 2;
+                continue;
+
                 //===
                 // Commands ending this buffer
-            case VerilatedTraceOffloadCommand::END: VL_TRACE_OFFLOAD_DEBUG("Command END"); break;
+            case VerilatedTraceOffloadCommand::END:  //
+                VL_TRACE_OFFLOAD_DEBUG("Command END");
+                break;
             case VerilatedTraceOffloadCommand::SHUTDOWN:
                 VL_TRACE_OFFLOAD_DEBUG("Command SHUTDOWN");
                 shutdown = true;
@@ -202,8 +213,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
             //===
             // Unknown command
             default: {  // LCOV_EXCL_START
-                VL_TRACE_OFFLOAD_DEBUG("Command UNKNOWN");
-                VL_PRINTF_MT("Trace command: 0x%08x\n", cmd);
+                VL_TRACE_OFFLOAD_DEBUG("Command UNKNOWN " << cmd);
                 VL_FATAL_MT(__FILE__, __LINE__, "", "Unknown trace command");
                 break;
             }  // LCOV_EXCL_STOP
@@ -221,7 +231,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain() {
     } while (VL_LIKELY(!shutdown));
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::shutdownOffloadWorker() {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::shutdownOffloadWorker() {
     // If the worker thread is not running, done..
     if (!m_workerThread) return;
 
@@ -241,7 +251,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::shutdownOffloadWorker() {
 //=============================================================================
 // Life cycle
 
-template <> void VerilatedTrace<VL_DERIVED_T>::closeBase() {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::closeBase() {
 #ifdef VL_TRACE_OFFLOAD
     shutdownOffloadWorker();
     while (m_numOffloadBuffers) {
@@ -251,7 +261,7 @@ template <> void VerilatedTrace<VL_DERIVED_T>::closeBase() {
 #endif
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::flushBase() {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::flushBase() {
 #ifdef VL_TRACE_OFFLOAD
     // Hand an empty buffer to the worker thread
     uint32_t* const bufferp = getOffloadBuffer();
@@ -266,46 +276,29 @@ template <> void VerilatedTrace<VL_DERIVED_T>::flushBase() {
 //=============================================================================
 // Callbacks to run on global events
 
-template <> void VerilatedTrace<VL_DERIVED_T>::onFlush(void* selfp) {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::onFlush(void* selfp) {
     // This calls 'flush' on the derived class (which must then get any mutex)
-    reinterpret_cast<VL_DERIVED_T*>(selfp)->flush();
+    reinterpret_cast<VL_SUB_T*>(selfp)->flush();
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::onExit(void* selfp) {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::onExit(void* selfp) {
     // This calls 'close' on the derived class (which must then get any mutex)
-    reinterpret_cast<VL_DERIVED_T*>(selfp)->close();
+    reinterpret_cast<VL_SUB_T*>(selfp)->close();
 }
 
 //=============================================================================
 // VerilatedTrace
 
-template <>
-VerilatedTrace<VL_DERIVED_T>::VerilatedTrace()
-    : m_sigs_oldvalp{nullptr}
-    , m_sigs_enabledp{nullptr}
-    , m_timeLastDump{0}
-    , m_fullDump{true}
-    , m_nextCode{0}
-    , m_numSignals{0}
-    , m_maxBits{0}
-    , m_scopeEscape{'.'}
-    , m_timeRes{1e-9}
-    , m_timeUnit {
-    1e-9
-}
-#ifdef VL_TRACE_OFFLOAD
-, m_numOffloadBuffers { 0 }
-#endif
-{
+template <> VerilatedTrace<VL_SUB_T, VL_BUF_T>::VerilatedTrace() {
     set_time_unit(Verilated::threadContextp()->timeunitString());
     set_time_resolution(Verilated::threadContextp()->timeprecisionString());
 }
 
-template <> VerilatedTrace<VL_DERIVED_T>::~VerilatedTrace() {
+template <> VerilatedTrace<VL_SUB_T, VL_BUF_T>::~VerilatedTrace() {
     if (m_sigs_oldvalp) VL_DO_CLEAR(delete[] m_sigs_oldvalp, m_sigs_oldvalp = nullptr);
     if (m_sigs_enabledp) VL_DO_CLEAR(delete[] m_sigs_enabledp, m_sigs_enabledp = nullptr);
-    Verilated::removeFlushCb(VerilatedTrace<VL_DERIVED_T>::onFlush, this);
-    Verilated::removeExitCb(VerilatedTrace<VL_DERIVED_T>::onExit, this);
+    Verilated::removeFlushCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onFlush, this);
+    Verilated::removeExitCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onExit, this);
 #ifdef VL_TRACE_OFFLOAD
     closeBase();
 #endif
@@ -314,7 +307,7 @@ template <> VerilatedTrace<VL_DERIVED_T>::~VerilatedTrace() {
 //=========================================================================
 // Internals available to format specific implementations
 
-template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::traceInit() VL_MT_UNSAFE {
     // Note: It is possible to re-open a trace file (VCD in particular),
     // so we must reset the next code here, but it must have the same number
     // of codes on re-open
@@ -359,8 +352,8 @@ template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
     }
 
     // Set callback so flush/abort will flush this file
-    Verilated::addFlushCb(VerilatedTrace<VL_DERIVED_T>::onFlush, this);
-    Verilated::addExitCb(VerilatedTrace<VL_DERIVED_T>::onExit, this);
+    Verilated::addFlushCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onFlush, this);
+    Verilated::addExitCb(VerilatedTrace<VL_SUB_T, VL_BUF_T>::onExit, this);
 
 #ifdef VL_TRACE_OFFLOAD
     // Compute offload buffer size. we need to be able to store a new value for
@@ -372,13 +365,13 @@ template <> void VerilatedTrace<VL_DERIVED_T>::traceInit() VL_MT_UNSAFE {
 
     // Start the worker thread
     m_workerThread.reset(
-        new std::thread{&VerilatedTrace<VL_DERIVED_T>::offloadWorkerThreadMain, this});
+        new std::thread{&VerilatedTrace<VL_SUB_T, VL_BUF_T>::offloadWorkerThreadMain, this});
 #endif
 }
 
 template <>
-bool VerilatedTrace<VL_DERIVED_T>::declCode(uint32_t code, const char* namep, uint32_t bits,
-                                            bool tri) {
+bool VerilatedTrace<VL_SUB_T, VL_BUF_T>::declCode(uint32_t code, const char* namep, uint32_t bits,
+                                                  bool tri) {
     if (VL_UNCOVERABLE(!code)) {
         VL_FATAL_MT(__FILE__, __LINE__, "", "Internal: internal trace problem, code 0 is illegal");
     }
@@ -422,28 +415,30 @@ bool VerilatedTrace<VL_DERIVED_T>::declCode(uint32_t code, const char* namep, ui
 //=========================================================================
 // Internals available to format specific implementations
 
-template <> std::string VerilatedTrace<VL_DERIVED_T>::timeResStr() const {
+template <> std::string VerilatedTrace<VL_SUB_T, VL_BUF_T>::timeResStr() const {
     return doubleToTimescale(m_timeRes);
 }
 
 //=========================================================================
 // External interface to client code
 
-template <> void VerilatedTrace<VL_DERIVED_T>::set_time_unit(const char* unitp) VL_MT_SAFE {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_unit(const char* unitp) VL_MT_SAFE {
     m_timeUnit = timescaleToDouble(unitp);
 }
-template <> void VerilatedTrace<VL_DERIVED_T>::set_time_unit(const std::string& unit) VL_MT_SAFE {
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_unit(const std::string& unit) VL_MT_SAFE {
     set_time_unit(unit.c_str());
 }
-template <> void VerilatedTrace<VL_DERIVED_T>::set_time_resolution(const char* unitp) VL_MT_SAFE {
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_resolution(const char* unitp) VL_MT_SAFE {
     m_timeRes = timescaleToDouble(unitp);
 }
 template <>
-void VerilatedTrace<VL_DERIVED_T>::set_time_resolution(const std::string& unit) VL_MT_SAFE {
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::set_time_resolution(const std::string& unit) VL_MT_SAFE {
     set_time_resolution(unit.c_str());
 }
 template <>
-void VerilatedTrace<VL_DERIVED_T>::dumpvars(int level, const std::string& hier) VL_MT_SAFE {
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::dumpvars(int level, const std::string& hier) VL_MT_SAFE {
     if (level == 0) {
         m_dumpvars.clear();  // empty = everything on
     } else {
@@ -456,7 +451,87 @@ void VerilatedTrace<VL_DERIVED_T>::dumpvars(int level, const std::string& hier)
     }
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex) {
+#ifdef VL_TRACE_PARALLEL
+template <>  //
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::parallelWorkerTask(void* datap, bool) {
+    ParallelWorkerData* const wdp = reinterpret_cast<ParallelWorkerData*>(datap);
+    // Run the task
+    wdp->m_cb(wdp->m_userp, wdp->m_bufp);
+    // Mark buffer as ready
+    const VerilatedLockGuard lock{wdp->m_mutex};
+    wdp->m_ready.store(true);
+    if (wdp->m_waiting) wdp->m_cv.notify_one();
+}
+
+template <> VL_ATTR_NOINLINE void VerilatedTrace<VL_SUB_T, VL_BUF_T>::ParallelWorkerData::wait() {
+    // Spin first. Acquire pairs with the worker's m_ready.store(), publishing its buffer writes
+    for (int i = 0; i < VL_LOCK_SPINS; ++i) {
+        if (VL_LIKELY(m_ready.load(std::memory_order_acquire))) return;
+        VL_CPU_RELAX();
+    }
+    // Still not ready: sleep on the condvar; m_mutex orders the relaxed load below
+    VerilatedLockGuard lock{m_mutex};
+    m_waiting = true;
+    m_cv.wait(lock, [this] { return m_ready.load(std::memory_order_relaxed); });
+    m_waiting = false;
+}
+#endif
+
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::runParallelCallbacks(const ParallelCallbackMap& cbMap) {
+    for (VlThreadPool* threadPoolp : m_threadPoolps) {
+#ifdef VL_TRACE_PARALLEL
+        // If tracing in parallel, dispatch to the thread pool (if exists)
+        if (threadPoolp && threadPoolp->numThreads()) {
+            // List of work items for thread (std::list, as ParallelWorkerData is not movable)
+            std::list<ParallelWorkerData> workerData;
+            // We use the whole pool + the main thread
+            const unsigned threads = threadPoolp->numThreads() + 1;
+            // Main thread executes all jobs with index % threads == 0
+            std::vector<ParallelWorkerData*> mainThreadWorkerData;
+            // The tracing callbacks to execute on this thread-pool
+            const auto& cbVec = cbMap.at(threadPoolp);
+            // Enqueue all the jobs
+            for (unsigned i = 0; i < cbVec.size(); ++i) {
+                const CallbackRecord& cbr = cbVec[i];
+                // Always get the trace buffer on the main thread
+                Buffer* const bufp = getTraceBuffer();
+                // Create new work item
+                workerData.emplace_back(cbr.m_dumpCb, cbr.m_userp, bufp);
+                // Grab the new work item
+                ParallelWorkerData* const itemp = &workerData.back();
+                // Enqueue task to thread pool, or main thread
+                if (unsigned rem = i % threads) {
+                    threadPoolp->workerp(rem - 1)->addTask(parallelWorkerTask, itemp, false);
+                } else {
+                    mainThreadWorkerData.push_back(itemp);
+                }
+            }
+            // Execute main thread jobs
+            for (ParallelWorkerData* const itemp : mainThreadWorkerData) {
+                parallelWorkerTask(itemp, false);
+            }
+            // Commit all trace buffers in order
+            for (ParallelWorkerData& item : workerData) {
+                // Wait until ready
+                item.wait();
+                // Commit the buffer
+                commitTraceBuffer(item.m_bufp);
+            }
+            continue;
+        }
+#endif
+        // Fall back on sequential execution
+        for (const CallbackRecord& cbr : cbMap.at(threadPoolp)) {
+            Buffer* const traceBufferp = getTraceBuffer();
+            cbr.m_dumpCb(cbr.m_userp, traceBufferp);
+            commitTraceBuffer(traceBufferp);
+        }
+    }
+}
+
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::dump(uint64_t timeui) VL_MT_SAFE_EXCLUDES(m_mutex) {
     // Not really VL_MT_SAFE but more VL_MT_UNSAFE_ONE.
     // This does get the mutex, but if multiple threads are trying to dump
     // chances are the data being dumped will have other problems
@@ -504,20 +579,14 @@ template <> void VerilatedTrace<VL_DERIVED_T>::dump(uint64_t timeui) VL_MT_SAFE_
     // Run the callbacks
     if (VL_UNLIKELY(m_fullDump)) {
         m_fullDump = false;  // No more need for next dump to be full
-        for (uint32_t i = 0; i < m_fullCbs.size(); ++i) {
-            const CallbackRecord& cbr = m_fullCbs[i];
-            cbr.m_dumpCb(cbr.m_userp, self());
-        }
+        runParallelCallbacks(m_fullCbs);
     } else {
-        for (uint32_t i = 0; i < m_chgCbs.size(); ++i) {
-            const CallbackRecord& cbr = m_chgCbs[i];
-            cbr.m_dumpCb(cbr.m_userp, self());
-        }
+        runParallelCallbacks(m_chgCbs);
     }
 
     for (uint32_t i = 0; i < m_cleanupCbs.size(); ++i) {
         const CallbackRecord& cbr = m_cleanupCbs[i];
-        cbr.m_dumpCb(cbr.m_userp, self());
+        cbr.m_cleanupCb(cbr.m_userp, self());
     }
 
 #ifdef VL_TRACE_OFFLOAD
@@ -538,8 +607,18 @@ template <> void VerilatedTrace<VL_DERIVED_T>::dump(uint64_t timeui) VL_MT_SAFE_
 // Non-hot path internal interface to Verilator generated code
 
 template <>
-void VerilatedTrace<VL_DERIVED_T>::addCallbackRecord(std::vector<CallbackRecord>& cbVec,
-                                                     CallbackRecord& cbRec)
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addThreadPool(VlThreadPool* threadPoolp)
+    VL_MT_SAFE_EXCLUDES(m_mutex) {
+    const VerilatedLockGuard lock{m_mutex};
+    for (VlThreadPool* const poolp : m_threadPoolps) {
+        if (poolp == threadPoolp) return;
+    }
+    m_threadPoolps.push_back(threadPoolp);
+}
+
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addCallbackRecord(std::vector<CallbackRecord>& cbVec,
+                                                           CallbackRecord& cbRec)
     VL_MT_SAFE_EXCLUDES(m_mutex) {
     const VerilatedLockGuard lock{m_mutex};
     if (VL_UNCOVERABLE(timeLastDump() != 0)) {  // LCOV_EXCL_START
@@ -550,91 +629,40 @@ void VerilatedTrace<VL_DERIVED_T>::addCallbackRecord(std::vector<CallbackRecord>
     cbVec.push_back(cbRec);
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::addInitCb(initCb_t cb, void* userp) VL_MT_SAFE {
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addInitCb(initCb_t cb, void* userp) VL_MT_SAFE {
     CallbackRecord cbr{cb, userp};
     addCallbackRecord(m_initCbs, cbr);
 }
-template <> void VerilatedTrace<VL_DERIVED_T>::addFullCb(dumpCb_t cb, void* userp) VL_MT_SAFE {
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addFullCb(dumpCb_t cb, void* userp,
+                                                   VlThreadPool* threadPoolp) VL_MT_SAFE {
     CallbackRecord cbr{cb, userp};
-    addCallbackRecord(m_fullCbs, cbr);
+    addThreadPool(threadPoolp);
+    addCallbackRecord(m_fullCbs[threadPoolp], cbr);
 }
-template <> void VerilatedTrace<VL_DERIVED_T>::addChgCb(dumpCb_t cb, void* userp) VL_MT_SAFE {
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addChgCb(dumpCb_t cb, void* userp,
+                                                  VlThreadPool* threadPoolp) VL_MT_SAFE {
     CallbackRecord cbr{cb, userp};
-    addCallbackRecord(m_chgCbs, cbr);
+    addThreadPool(threadPoolp);
+    addCallbackRecord(m_chgCbs[threadPoolp], cbr);
 }
-template <> void VerilatedTrace<VL_DERIVED_T>::addCleanupCb(dumpCb_t cb, void* userp) VL_MT_SAFE {
+template <>
+void VerilatedTrace<VL_SUB_T, VL_BUF_T>::addCleanupCb(cleanupCb_t cb, void* userp) VL_MT_SAFE {
     CallbackRecord cbr{cb, userp};
     addCallbackRecord(m_cleanupCbs, cbr);
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::pushNamePrefix(const std::string& prefix) {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::pushNamePrefix(const std::string& prefix) {
     m_namePrefixStack.push_back(m_namePrefixStack.back() + prefix);
 }
 
-template <> void VerilatedTrace<VL_DERIVED_T>::popNamePrefix(unsigned count) {
+template <> void VerilatedTrace<VL_SUB_T, VL_BUF_T>::popNamePrefix(unsigned count) {
     while (count--) m_namePrefixStack.pop_back();
     assert(!m_namePrefixStack.empty());
 }
 
-//=========================================================================
-// Hot path internal interface to Verilator generated code
-
-// These functions must write the new value back into the old value store,
-// and subsequently call the format specific emit* implementations. Note
-// that this file must be included in the format specific implementation, so
-// the emit* functions can be inlined for performance.
-
-template <> void VerilatedTrace<VL_DERIVED_T>::fullBit(uint32_t* oldp, CData newval) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    self()->emitBit(code, newval);
-}
-
-template <> void VerilatedTrace<VL_DERIVED_T>::fullCData(uint32_t* oldp, CData newval, int bits) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    self()->emitCData(code, newval, bits);
-}
-
-template <> void VerilatedTrace<VL_DERIVED_T>::fullSData(uint32_t* oldp, SData newval, int bits) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    self()->emitSData(code, newval, bits);
-}
-
-template <> void VerilatedTrace<VL_DERIVED_T>::fullIData(uint32_t* oldp, IData newval, int bits) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    self()->emitIData(code, newval, bits);
-}
-
-template <> void VerilatedTrace<VL_DERIVED_T>::fullQData(uint32_t* oldp, QData newval, int bits) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    *reinterpret_cast<QData*>(oldp) = newval;
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    self()->emitQData(code, newval, bits);
-}
-
-template <>
-void VerilatedTrace<VL_DERIVED_T>::fullWData(uint32_t* oldp, const WData* newvalp, int bits) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i];
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    self()->emitWData(code, newvalp, bits);
-}
-
-template <> void VerilatedTrace<VL_DERIVED_T>::fullDouble(uint32_t* oldp, double newval) {
-    const uint32_t code = oldp - m_sigs_oldvalp;
-    *reinterpret_cast<double*>(oldp) = newval;
-    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
-    // cppcheck-suppress invalidPointerCast
-    self()->emitDouble(code, newval);
-}
-
 //=========================================================================
 // Primitives converting binary values to strings...
 
@@ -725,41 +753,86 @@ static inline void cvtQDataToStr(char* dstp, QData value) {
 
 #define cvtEDataToStr cvtIDataToStr
 
-//=============================================================================
+//=========================================================================
+// VerilatedTraceBuffer
 
-#ifdef VERILATED_VCD_TEST
-
-void verilated_trace_imp_selftest() {
-#define SELF_CHECK(got, exp) \
-    do { \
-        if ((got) != (exp)) VL_FATAL_MT(__FILE__, __LINE__, "", "%Error: selftest"); \
-    } while (0)
-
-#define SELF_CHECK_TS(scale) \
-    SELF_CHECK(doubleToTimescale(timescaleToDouble(scale)), std::string{scale});
-    SELF_CHECK_TS("100s");
-    SELF_CHECK_TS("10s");
-    SELF_CHECK_TS("1s");
-    SELF_CHECK_TS("100ms");
-    SELF_CHECK_TS("10ms");
-    SELF_CHECK_TS("1ms");
-    SELF_CHECK_TS("100us");
-    SELF_CHECK_TS("10us");
-    SELF_CHECK_TS("1us");
-    SELF_CHECK_TS("100ns");
-    SELF_CHECK_TS("10ns");
-    SELF_CHECK_TS("1ns");
-    SELF_CHECK_TS("100ps");
-    SELF_CHECK_TS("10ps");
-    SELF_CHECK_TS("1ps");
-    SELF_CHECK_TS("100fs");
-    SELF_CHECK_TS("10fs");
-    SELF_CHECK_TS("1fs");
-    SELF_CHECK_TS("100as");
-    SELF_CHECK_TS("10as");
-    SELF_CHECK_TS("1as");
+template <>  //
+VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::VerilatedTraceBuffer(VL_SUB_T& owner)
+    : m_owner{owner} {
+#ifdef VL_TRACE_OFFLOAD
+    if (m_offloadBufferWritep) {
+        using This = VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>*;
+        // Tack on the buffer address
+        static_assert(2 * sizeof(uint32_t) >= sizeof(This),
+                      "This should be enough on all plafrorms");
+        *m_offloadBufferWritep++ = VerilatedTraceOffloadCommand::TRACE_BUFFER;
+        *reinterpret_cast<This*>(m_offloadBufferWritep) = this;
+        m_offloadBufferWritep += 2;
+    }
+#endif
 }
 
-#endif
+// These functions must write the new value back into the old value store,
+// and subsequently call the format specific emit* implementations. Note
+// that this file must be included in the format specific implementation, so
+// the emit* functions can be inlined for performance.
+
+template <>  //
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullBit(uint32_t* oldp, CData newval) {
+    const uint32_t code = oldp - m_sigs_oldvalp;
+    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    self()->emitBit(code, newval);
+}
+
+template <>
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullCData(uint32_t* oldp, CData newval, int bits) {
+    const uint32_t code = oldp - m_sigs_oldvalp;
+    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    self()->emitCData(code, newval, bits);
+}
+
+template <>
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullSData(uint32_t* oldp, SData newval, int bits) {
+    const uint32_t code = oldp - m_sigs_oldvalp;
+    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    self()->emitSData(code, newval, bits);
+}
+
+template <>
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullIData(uint32_t* oldp, IData newval, int bits) {
+    const uint32_t code = oldp - m_sigs_oldvalp;
+    *oldp = newval;  // Still copy even if not tracing so chg doesn't call full
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    self()->emitIData(code, newval, bits);
+}
+
+template <>
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullQData(uint32_t* oldp, QData newval, int bits) {
+    const uint32_t code = oldp - m_sigs_oldvalp;
+    *reinterpret_cast<QData*>(oldp) = newval;
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    self()->emitQData(code, newval, bits);
+}
+
+template <>
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullWData(uint32_t* oldp, const WData* newvalp,
+                                                         int bits) {
+    const uint32_t code = oldp - m_sigs_oldvalp;
+    for (int i = 0; i < VL_WORDS_I(bits); ++i) oldp[i] = newvalp[i];
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    self()->emitWData(code, newvalp, bits);
+}
+
+template <>
+void VerilatedTraceBuffer<VL_SUB_T, VL_BUF_T>::fullDouble(uint32_t* oldp, double newval) {
+    const uint32_t code = oldp - m_sigs_oldvalp;
+    *reinterpret_cast<double*>(oldp) = newval;
+    if (VL_UNLIKELY(m_sigs_enabledp && !(VL_BITISSET_W(m_sigs_enabledp, code)))) return;
+    // cppcheck-suppress invalidPointerCast
+    self()->emitDouble(code, newval);
+}
 
 #endif  // VL_CPPCHECK
diff --git a/include/verilated_vcd_c.cpp b/include/verilated_vcd_c.cpp
index 78383befc..9db71aabc 100644
--- a/include/verilated_vcd_c.cpp
+++ b/include/verilated_vcd_c.cpp
@@ -62,12 +62,23 @@ constexpr unsigned VL_TRACE_MAX_VCD_CODE_SIZE = 5;  // Maximum length of a VCD s
 // cache-lines.
 constexpr unsigned VL_TRACE_SUFFIX_ENTRY_SIZE = 8;  // Size of a suffix entry
 
+//=============================================================================
+// Utility functions: TODO: put these in a common place and share them.
+
+template <size_t N> static size_t roundUpToMultipleOf(size_t value) {
+    static_assert((N & (N - 1)) == 0, "'N' must be a power of 2");
+    size_t mask = N - 1;
+    return (value + mask) & ~mask;
+}
+
 //=============================================================================
 // Specialization of the generics for this trace format
 
-#define VL_DERIVED_T VerilatedVcd
-#include "verilated_trace_imp.cpp"
-#undef VL_DERIVED_T
+#define VL_SUB_T VerilatedVcd
+#define VL_BUF_T VerilatedVcdBuffer
+#include "verilated_trace_imp.h"
+#undef VL_SUB_T
+#undef VL_BUF_T
 
 //=============================================================================
 //=============================================================================
@@ -183,7 +194,7 @@ void VerilatedVcd::makeNameMap() {
     deleteNameMap();
     m_namemapp = new NameMap;
 
-    VerilatedTrace<VerilatedVcd>::traceInit();
+    Super::traceInit();
 
     // Though not speced, it's illegal to generate a vcd with signals
     // not under any module - it crashes at least two viewers.
@@ -218,13 +229,17 @@ VerilatedVcd::~VerilatedVcd() {
     if (m_wrBufp) VL_DO_CLEAR(delete[] m_wrBufp, m_wrBufp = nullptr);
     deleteNameMap();
     if (m_filep && m_fileNewed) VL_DO_CLEAR(delete m_filep, m_filep = nullptr);
+#ifdef VL_TRACE_PARALLEL
+    assert(m_numBuffers == m_freeBuffers.size());
+    for (auto& pair : m_freeBuffers) VL_DO_CLEAR(delete[] pair.first, pair.first = nullptr);
+#endif
 }
 
 void VerilatedVcd::closePrev() {
     // This function is on the flush() call path
     if (!isOpen()) return;
 
-    VerilatedTrace<VerilatedVcd>::flushBase();
+    Super::flushBase();
     bufferFlush();
     m_isOpen = false;
     m_filep->close();
@@ -251,14 +266,14 @@ void VerilatedVcd::close() VL_MT_SAFE_EXCLUDES(m_mutex) {
         printStr(" $end\n");
     }
     closePrev();
-    // closePrev() called VerilatedTrace<VerilatedVcd>::flush(), so we just
+    // closePrev() called Super::flush(), so we just
     // need to shut down the tracing thread here.
-    VerilatedTrace<VerilatedVcd>::closeBase();
+    Super::closeBase();
 }
 
 void VerilatedVcd::flush() VL_MT_SAFE_EXCLUDES(m_mutex) {
     const VerilatedLockGuard lock{m_mutex};
-    VerilatedTrace<VerilatedVcd>::flushBase();
+    Super::flushBase();
     bufferFlush();
 }
 
@@ -277,12 +292,12 @@ void VerilatedVcd::printQuad(uint64_t n) {
     printStr(buf);
 }
 
-void VerilatedVcd::bufferResize(uint64_t minsize) {
+void VerilatedVcd::bufferResize(size_t minsize) {
     // minsize is size of largest write.  We buffer at least 8 times as much data,
     // writing when we are 3/4 full (with thus 2*minsize remaining free)
     if (VL_UNLIKELY(minsize > m_wrChunkSize)) {
         const char* oldbufp = m_wrBufp;
-        m_wrChunkSize = minsize * 2;
+        m_wrChunkSize = roundUpToMultipleOf<1024>(minsize * 2);
         m_wrBufp = new char[m_wrChunkSize * 8];
         std::memcpy(m_wrBufp, oldbufp, m_writep - oldbufp);
         m_writep = m_wrBufp + (m_writep - oldbufp);
@@ -463,14 +478,16 @@ void VerilatedVcd::declare(uint32_t code, const char* name, const char* wirep, b
                            int arraynum, bool tri, bool bussed, int msb, int lsb) {
     const int bits = ((msb > lsb) ? (msb - lsb) : (lsb - msb)) + 1;
 
-    const bool enabled = VerilatedTrace<VerilatedVcd>::declCode(code, name, bits, tri);
+    const bool enabled = Super::declCode(code, name, bits, tri);
 
     if (m_suffixes.size() <= nextCode() * VL_TRACE_SUFFIX_ENTRY_SIZE) {
         m_suffixes.resize(nextCode() * VL_TRACE_SUFFIX_ENTRY_SIZE * 2, 0);
     }
 
-    // Make sure write buffer is large enough (one character per bit), plus header
-    bufferResize(bits + 1024);
+    // Keep upper bound on bytes a single signal can emit into the buffer
+    m_maxSignalBytes = std::max<size_t>(m_maxSignalBytes, bits + 32);
+    // Make sure write buffer is large enough, plus header
+    bufferResize(m_maxSignalBytes + 1024);
 
     if (!enabled) return;
 
@@ -562,26 +579,73 @@ void VerilatedVcd::declArray(uint32_t code, const char* name, bool array, int ar
 void VerilatedVcd::declDouble(uint32_t code, const char* name, bool array, int arraynum) {
     declare(code, name, "real", array, arraynum, false, false, 63, 0);
 }
-#ifdef VL_TRACE_VCD_OLD_API
-void VerilatedVcd::declTriBit(uint32_t code, const char* name, bool array, int arraynum) {
-    declare(code, name, "wire", array, arraynum, true, false, 0, 0);
-}
-void VerilatedVcd::declTriBus(uint32_t code, const char* name, bool array, int arraynum, int msb,
-                              int lsb) {
-    declare(code, name, "wire", array, arraynum, true, true, msb, lsb);
-}
-void VerilatedVcd::declTriQuad(uint32_t code, const char* name, bool array, int arraynum, int msb,
-                               int lsb) {
-    declare(code, name, "wire", array, arraynum, true, true, msb, lsb);
-}
-void VerilatedVcd::declTriArray(uint32_t code, const char* name, bool array, int arraynum, int msb,
-                                int lsb) {
-    declare(code, name, "wire", array, arraynum, true, true, msb, lsb);
-}
-#endif  //  VL_TRACE_VCD_OLD_API
 
 //=============================================================================
-// Trace rendering prinitives
+// Get/commit trace buffer
+
+VerilatedVcdBuffer* VerilatedVcd::getTraceBuffer() {
+#ifdef VL_TRACE_PARALLEL
+    // Note: This is called from VerilatedVcd::dump, which already holds the lock
+    // If no buffer available, allocate a new one
+    if (m_freeBuffers.empty()) {
+        constexpr size_t pageSize = 4096;
+        // 4 * m_maxSignalBytes, so we can reserve 2 * m_maxSignalBytes at the end for safety
+        size_t startingSize = roundUpToMultipleOf<pageSize>(4 * m_maxSignalBytes);
+        m_freeBuffers.emplace_back(new char[startingSize], startingSize);
+        ++m_numBuffers;
+    }
+    // Grab a buffer
+    const auto pair = m_freeBuffers.back();
+    m_freeBuffers.pop_back();
+    // Return the buffer
+    return new VerilatedVcdBuffer{*this, pair.first, pair.second};
+#else
+    return new VerilatedVcdBuffer{*this};
+#endif
+}
+
+void VerilatedVcd::commitTraceBuffer(VerilatedVcdBuffer* bufp) {
+#ifdef VL_TRACE_PARALLEL
+    // Note: This is called from VerilatedVcd::dump, which already holds the lock
+    // Resize output buffer. Note, we use the full size of the trace buffer, as
+    // this is a lot more stable than the actual occupancy of the trace buffer.
+    // This helps us to avoid re-allocations due to small size changes.
+    bufferResize(bufp->m_size);
+    // Compute occupancy of buffer
+    const size_t usedSize = bufp->m_writep - bufp->m_bufp;
+    // Copy to output buffer
+    std::memcpy(m_writep, bufp->m_bufp, usedSize);
+    // Adjust write pointer
+    m_writep += usedSize;
+    // Flush if necessary
+    bufferCheck();
+    // Put buffer back on free list
+    m_freeBuffers.emplace_back(bufp->m_bufp, bufp->m_size);
+#else
+    // Needs adjusting for emitTimeChange
+    m_writep = bufp->m_writep;
+#endif
+    delete bufp;
+}
+
+//=============================================================================
+// VerilatedVcdBuffer implementation
+
+#ifdef VL_TRACE_PARALLEL
+VerilatedVcdBuffer::VerilatedVcdBuffer(VerilatedVcd& owner, char* bufp, size_t size)
+    : VerilatedTraceBuffer<VerilatedVcd, VerilatedVcdBuffer>{owner}
+    , m_writep{bufp}
+    , m_bufp{bufp}
+    , m_size{size} {
+    adjustGrowp();
+}
+#else
+VerilatedVcdBuffer::VerilatedVcdBuffer(VerilatedVcd& owner)
+    : VerilatedTraceBuffer<VerilatedVcd, VerilatedVcdBuffer>{owner} {}
+#endif
+
+//=============================================================================
+// Trace rendering primitives
 
 static inline void
 VerilatedVcdCCopyAndAppendNewLine(char* writep, const char* suffixp) VL_ATTR_NO_SANITIZE_ALIGN;
@@ -606,26 +670,55 @@ static inline void VerilatedVcdCCopyAndAppendNewLine(char* writep, const char* s
 #endif
 }
 
-void VerilatedVcd::finishLine(uint32_t code, char* writep) {
-    const char* const suffixp = m_suffixes.data() + code * VL_TRACE_SUFFIX_ENTRY_SIZE;
+void VerilatedVcdBuffer::finishLine(uint32_t code, char* writep) {
+    const char* const suffixp = m_suffixes + code * VL_TRACE_SUFFIX_ENTRY_SIZE;
     VL_DEBUG_IFDEF(assert(suffixp[0]););
     VerilatedVcdCCopyAndAppendNewLine(writep, suffixp);
 
     // Now write back the write pointer incremented by the actual size of the
     // suffix, which was stored in the last byte of the suffix buffer entry.
     m_writep = writep + suffixp[VL_TRACE_SUFFIX_ENTRY_SIZE - 1];
-    bufferCheck();
+
+#ifdef VL_TRACE_PARALLEL
+    // Double the size of the buffer if necessary
+    if (VL_UNLIKELY(m_writep >= m_growp)) {
+        // Compute occupied size of current buffer
+        const size_t usedSize = m_writep - m_bufp;
+        // We are always doubling the size
+        m_size *= 2;
+        // Allocate the new buffer
+        char* const newBufp = new char[m_size];
+        // Copy from current buffer to new buffer
+        std::memcpy(newBufp, m_bufp, usedSize);
+        // Delete current buffer
+        delete[] m_bufp;
+        // Make new buffer the current buffer
+        m_bufp = newBufp;
+        // Adjust write pointer
+        m_writep = m_bufp + usedSize;
+        // Adjust resize limit
+        adjustGrowp();
+    }
+#else
+    // Flush the write buffer if there's not enough space left for new information
+    // We only call this once per vector, so we need enough slop for a very wide "b###" line
+    if (VL_UNLIKELY(m_writep > m_wrFlushp)) {
+        m_owner.m_writep = m_writep;
+        m_owner.bufferFlush();
+        m_writep = m_owner.m_writep;
+    }
+#endif
 }
 
 //=============================================================================
 // emit* trace routines
 
 // Note: emit* are only ever called from one place (full* in
-// verilated_trace_imp.cpp, which is included in this file at the top),
+// verilated_trace_imp.h, which is included in this file at the top),
 // so always inline them.
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitBit(uint32_t code, CData newval) {
+void VerilatedVcdBuffer::emitBit(uint32_t code, CData newval) {
     // Don't prefetch suffix as it's a bit too late;
     char* wp = m_writep;
     *wp++ = '0' | static_cast<char>(newval);
@@ -633,7 +726,7 @@ void VerilatedVcd::emitBit(uint32_t code, CData newval) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitCData(uint32_t code, CData newval, int bits) {
+void VerilatedVcdBuffer::emitCData(uint32_t code, CData newval, int bits) {
     char* wp = m_writep;
     *wp++ = 'b';
     cvtCDataToStr(wp, newval << (VL_BYTESIZE - bits));
@@ -641,7 +734,7 @@ void VerilatedVcd::emitCData(uint32_t code, CData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitSData(uint32_t code, SData newval, int bits) {
+void VerilatedVcdBuffer::emitSData(uint32_t code, SData newval, int bits) {
     char* wp = m_writep;
     *wp++ = 'b';
     cvtSDataToStr(wp, newval << (VL_SHORTSIZE - bits));
@@ -649,7 +742,7 @@ void VerilatedVcd::emitSData(uint32_t code, SData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitIData(uint32_t code, IData newval, int bits) {
+void VerilatedVcdBuffer::emitIData(uint32_t code, IData newval, int bits) {
     char* wp = m_writep;
     *wp++ = 'b';
     cvtIDataToStr(wp, newval << (VL_IDATASIZE - bits));
@@ -657,7 +750,7 @@ void VerilatedVcd::emitIData(uint32_t code, IData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitQData(uint32_t code, QData newval, int bits) {
+void VerilatedVcdBuffer::emitQData(uint32_t code, QData newval, int bits) {
     char* wp = m_writep;
     *wp++ = 'b';
     cvtQDataToStr(wp, newval << (VL_QUADSIZE - bits));
@@ -665,7 +758,7 @@ void VerilatedVcd::emitQData(uint32_t code, QData newval, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitWData(uint32_t code, const WData* newvalp, int bits) {
+void VerilatedVcdBuffer::emitWData(uint32_t code, const WData* newvalp, int bits) {
     int words = VL_WORDS_I(bits);
     char* wp = m_writep;
     *wp++ = 'b';
@@ -682,272 +775,10 @@ void VerilatedVcd::emitWData(uint32_t code, const WData* newvalp, int bits) {
 }
 
 VL_ATTR_ALWINLINE
-void VerilatedVcd::emitDouble(uint32_t code, double newval) {
+void VerilatedVcdBuffer::emitDouble(uint32_t code, double newval) {
     char* wp = m_writep;
     // Buffer can't overflow before VL_SNPRINTF; we sized during declaration
-    VL_SNPRINTF(wp, m_wrChunkSize, "r%.16g", newval);
+    VL_SNPRINTF(wp, m_maxSignalBytes, "r%.16g", newval);
     wp += std::strlen(wp);
     finishLine(code, wp);
 }
-
-#ifdef VL_TRACE_VCD_OLD_API
-
-void VerilatedVcd::fullBit(uint32_t code, const uint32_t newval) {
-    // Note the &1, so we don't require clean input -- makes more common no change case faster
-    *oldp(code) = newval;
-    *m_writep++ = ('0' + static_cast<char>(newval & 1));
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullBus(uint32_t code, const uint32_t newval, int bits) {
-    *oldp(code) = newval;
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = ((newval & (1L << bit)) ? '1' : '0');
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullQuad(uint32_t code, const uint64_t newval, int bits) {
-    (*(reinterpret_cast<uint64_t*>(oldp(code)))) = newval;
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = ((newval & (1ULL << bit)) ? '1' : '0');
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullArray(uint32_t code, const uint32_t* newval, int bits) {
-    for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) { oldp(code)[word] = newval[word]; }
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = ((newval[(bit / 32)] & (1L << (bit & 0x1f))) ? '1' : '0');
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullArray(uint32_t code, const uint64_t* newval, int bits) {
-    for (int word = 0; word < (((bits - 1) / 64) + 1); ++word) { oldp(code)[word] = newval[word]; }
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = ((newval[(bit / 64)] & (1ULL << (bit & 0x3f))) ? '1' : '0');
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullTriBit(uint32_t code, const uint32_t newval, const uint32_t newtri) {
-    oldp(code)[0] = newval;
-    oldp(code)[1] = newtri;
-    *m_writep++ = "01zz"[newval | (newtri << 1)];
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullTriBus(uint32_t code, const uint32_t newval, const uint32_t newtri,
-                              int bits) {
-    oldp(code)[0] = newval;
-    oldp(code)[1] = newtri;
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = "01zz"[((newval >> bit) & 1) | (((newtri >> bit) & 1) << 1)];
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullTriQuad(uint32_t code, const uint64_t newval, const uint64_t newtri,
-                               int bits) {
-    (*(reinterpret_cast<uint64_t*>(oldp(code)))) = newval;
-    (*(reinterpret_cast<uint64_t*>(oldp(code + 1)))) = newtri;
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        *m_writep++ = "01zz"[((newval >> bit) & 1ULL) | (((newtri >> bit) & 1ULL) << 1ULL)];
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullTriArray(uint32_t code, const uint32_t* newvalp, const uint32_t* newtrip,
-                                int bits) {
-    for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) {
-        oldp(code)[word * 2] = newvalp[word];
-        oldp(code)[word * 2 + 1] = newtrip[word];
-    }
-    *m_writep++ = 'b';
-    for (int bit = bits - 1; bit >= 0; --bit) {
-        uint32_t valbit = (newvalp[(bit / 32)] >> (bit & 0x1f)) & 1;
-        uint32_t tribit = (newtrip[(bit / 32)] >> (bit & 0x1f)) & 1;
-        *m_writep++ = "01zz"[valbit | (tribit << 1)];
-    }
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-void VerilatedVcd::fullDouble(uint32_t code, const double newval) {
-    // cppcheck-suppress invalidPointerCast
-    (*(reinterpret_cast<double*>(oldp(code)))) = newval;
-    // Buffer can't overflow before VL_SNPRINTF; we sized during declaration
-    VL_SNPRINTF(m_writep, m_wrChunkSize, "r%.16g", newval);
-    m_writep += std::strlen(m_writep);
-    *m_writep++ = ' ';
-    m_writep = writeCode(m_writep, code);
-    *m_writep++ = '\n';
-    bufferCheck();
-}
-
-#endif  // VL_TRACE_VCD_OLD_API
-
-//======================================================================
-//======================================================================
-//======================================================================
-
-#ifdef VERILATED_VCD_TEST
-#include <iostream>
-
-extern void verilated_trace_imp_selftest();
-
-uint32_t v1, v2, s1, s2[3];
-uint32_t tri96[3];
-uint32_t tri96__tri[3];
-uint64_t quad96[2];
-uint64_t tquad;
-uint64_t tquad__tri;
-uint8_t ch;
-uint64_t timestamp = 1;
-double doub = 0.0;
-float flo = 0.0f;
-
-void vcdInit(void*, VerilatedVcd* vcdp, uint32_t) {
-    vcdp->scopeEscape('.');
-    vcdp->pushNamePrefix("top.");
-    /**/ vcdp->declBus(0x2, "v1", -1, 0, 5, 1);
-    /**/ vcdp->declBus(0x3, "v2", -1, 0, 6, 1);
-    /**/ vcdp->pushNamePrefix("sub1.");
-    /***/ vcdp->declBit(0x4, "s1", -1, 0);
-    /***/ vcdp->declBit(0x5, "ch", -1, 0);
-    /**/ vcdp->popNamePrefix();
-    /**/ vcdp->pushNamePrefix("sub2.");
-    /***/ vcdp->declArray(0x6, "s2", -1, 0, 40, 3);
-    /**/ vcdp->popNamePrefix();
-    vcdp->popNamePrefix();
-    // Note need to add 3 for next code.
-    vcdp->pushNamePrefix("top2.");
-    /**/ vcdp->declBus(0x2, "t2v1", -1, 0, 4, 1);
-    /**/ vcdp->declTriBit(0x10, "io1", -1, 0);
-    /**/ vcdp->declTriBus(0x12, "io5", -1, 0, 4, 0);
-    /**/ vcdp->declTriArray(0x16, "io96", -1, 0, 95, 0);
-    /**/  // Note need to add 6 for next code.
-    /**/ vcdp->declDouble(0x1c, "doub", -1, 0);
-    /**/  // Note need to add 2 for next code.
-    /**/ vcdp->declArray(0x20, "q2", -1, 0, 95, 0);
-    /**/  // Note need to add 4 for next code.
-    /**/ vcdp->declTriQuad(0x24, "tq", -1, 0, 63, 0);
-    /**/  // Note need to add 4 for next code.
-    vcdp->popNamePrefix();
-}
-
-void vcdFull(void*, VerilatedVcd* vcdp) {
-    vcdp->fullBus(0x2, v1, 5);
-    vcdp->fullBus(0x3, v2, 7);
-    vcdp->fullBit(0x4, s1);
-    vcdp->fullBus(0x5, ch, 2);
-    vcdp->fullArray(0x6, &s2[0], 38);
-    vcdp->fullTriBit(0x10, tri96[0] & 1, tri96__tri[0] & 1);
-    vcdp->fullTriBus(0x12, tri96[0] & 0x1f, tri96__tri[0] & 0x1f, 5);
-    vcdp->fullTriArray(0x16, tri96, tri96__tri, 96);
-    vcdp->fullDouble(0x1c, doub);
-    vcdp->fullArray(0x20, &quad96[0], 96);
-    vcdp->fullTriQuad(0x24, tquad, tquad__tri, 64);
-}
-
-void vcdChange(void*, VerilatedVcd* vcdp) {
-    vcdp->chgBus(0x2, v1, 5);
-    vcdp->chgBus(0x3, v2, 7);
-    vcdp->chgBit(0x4, s1);
-    vcdp->chgBus(0x5, ch, 2);
-    vcdp->chgArray(0x6, &s2[0], 38);
-    vcdp->chgTriBit(0x10, tri96[0] & 1, tri96__tri[0] & 1);
-    vcdp->chgTriBus(0x12, tri96[0] & 0x1f, tri96__tri[0] & 0x1f, 5);
-    vcdp->chgTriArray(0x16, tri96, tri96__tri, 96);
-    vcdp->chgDouble(0x1c, doub);
-    vcdp->chgArray(0x20, &quad96[0], 96);
-    vcdp->chgTriQuad(0x24, tquad, tquad__tri, 64);
-}
-
-// clang-format off
-void vcdTestMain(const char* filenamep) {
-    verilated_trace_imp_selftest();
-
-    v1 = v2 = s1 = 0;
-    s2[0] = s2[1] = s2[2] = 0;
-    tri96[2] = tri96[1] = tri96[0] = 0;
-    tri96__tri[2] = tri96__tri[1] = tri96__tri[0] = ~0;
-    quad96[1] = quad96[0] = 0;
-    ch = 0;
-    doub = 0;
-    tquad = tquad__tri = 0;
-    {
-        VerilatedVcdC* vcdp = new VerilatedVcdC;
-        vcdp->evcd(true);
-        vcdp->set_time_unit("1ms");
-        vcdp->set_time_unit(std::string{"1ms"});
-        vcdp->set_time_resolution("1ns");
-        vcdp->set_time_resolution(std::string{"1ns"});
-        vcdp->spTrace()->addInitCb(&vcdInit, 0);
-        vcdp->spTrace()->addFullCb(&vcdFull, 0);
-        vcdp->spTrace()->addChgCb(&vcdChange, 0);
-        vcdp->open(filenamep);
-        // Dumping
-        vcdp->dump(++timestamp);
-        v1 = 0xfff;
-        tri96[2] = 4; tri96[1] = 2; tri96[0] = 1;
-        tri96__tri[2] = tri96__tri[1] = tri96__tri[0] = ~0;  // Still tri
-        quad96[1] = 0xffffffff; quad96[0] = 0;
-        doub = 1.5;
-        flo = 1.4f;
-        vcdp->dump(++timestamp);
-        v2 = 0x1;
-        s2[1] = 2;
-        tri96__tri[2] = tri96__tri[1] = tri96__tri[0] = 0;  // enable w/o data change
-        quad96[1] = 0; quad96[0] = ~0;
-        doub = -1.66e13;
-        flo = 0.123f;
-        tquad = 0x00ff00ff00ff00ffULL;
-        tquad__tri = 0x0000fffff0000ffffULL;
-        vcdp->dump(++timestamp);
-        ch = 2;
-        tri96[2] = ~4; tri96[1] = ~2; tri96[0] = ~1;
-        doub = -3.33e-13;
-        vcdp->dump(++timestamp);
-        vcdp->dump(++timestamp);
-# ifdef VERILATED_VCD_TEST_64BIT
-        const uint64_t bytesPerDump = 15ULL;
-        for (uint64_t i = 0; i < ((1ULL << 32) / bytesPerDump); i++) {
-            v1 = i;
-            vcdp->dump(++timestamp);
-        }
-# endif
-        vcdp->close();
-        VL_DO_CLEAR(delete vcdp, vcdp = nullptr);
-    }
-}
-#endif
-// clang-format on
-
-//********************************************************************
-// ;compile-command: "v4make test_regress/t/t_trace_c_api.pl"
-//
-// Local Variables:
-// End:
diff --git a/include/verilated_vcd_c.h b/include/verilated_vcd_c.h
index 5fbb6022c..0d83eb25d 100644
--- a/include/verilated_vcd_c.h
+++ b/include/verilated_vcd_c.h
@@ -28,39 +28,20 @@
 #include <string>
 #include <vector>
 
-class VerilatedVcd;
-
-//=============================================================================
-// VerilatedFile
-/// Class representing a file to write to. These virtual methods can be
-/// overrode for e.g. socket I/O.
-
-class VerilatedVcdFile VL_NOT_FINAL {
-private:
-    int m_fd = 0;  // File descriptor we're writing to
-public:
-    // METHODS
-    /// Construct a (as yet) closed file
-    VerilatedVcdFile() = default;
-    /// Close and destruct
-    virtual ~VerilatedVcdFile() = default;
-    /// Open a file with given filename
-    virtual bool open(const std::string& name) VL_MT_UNSAFE;
-    /// Close object's file
-    virtual void close() VL_MT_UNSAFE;
-    /// Write data to file (if it is open)
-    virtual ssize_t write(const char* bufp, ssize_t len) VL_MT_UNSAFE;
-};
+class VerilatedVcdBuffer;
+class VerilatedVcdFile;
 
 //=============================================================================
 // VerilatedVcd
 // Base class to create a Verilator VCD dump
 // This is an internally used class - see VerilatedVcdC for what to call from applications
 
-class VerilatedVcd VL_NOT_FINAL : public VerilatedTrace<VerilatedVcd> {
+class VerilatedVcd VL_NOT_FINAL : public VerilatedTrace<VerilatedVcd, VerilatedVcdBuffer> {
+public:
+    using Super = VerilatedTrace<VerilatedVcd, VerilatedVcdBuffer>;
+
 private:
-    // Give the superclass access to private bits (to avoid virtual functions)
-    friend class VerilatedTrace<VerilatedVcd>;
+    friend Buffer;  // Give the buffer access to the private bits
 
     //=========================================================================
     // VCD specific internals
@@ -74,9 +55,10 @@ private:
     int m_modDepth = 0;  // Depth of module hierarchy
 
     char* m_wrBufp;  // Output buffer
-    const char* m_wrFlushp;  // Output buffer flush trigger location
+    char* m_wrFlushp;  // Output buffer flush trigger location
     char* m_writep;  // Write pointer into output buffer
-    uint64_t m_wrChunkSize;  // Output buffer size
+    size_t m_wrChunkSize;  // Output buffer size
+    size_t m_maxSignalBytes = 0;  // Upper bound on number of bytes a single signal can generate
     uint64_t m_wroteBytes = 0;  // Number of bytes written to this file
 
     std::vector<char> m_suffixes;  // VCD line end string codes + metadata
@@ -84,7 +66,13 @@ private:
     using NameMap = std::map<const std::string, const std::string>;
     NameMap* m_namemapp = nullptr;  // List of names for the header
 
-    void bufferResize(uint64_t minsize);
+#ifdef VL_TRACE_PARALLEL
+    // Vector of free trace buffers as (pointer, size) pairs.
+    std::vector<std::pair<char*, size_t>> m_freeBuffers;
+    size_t m_numBuffers = 0;  // Number of trace buffers allocated
+#endif
+
+    void bufferResize(size_t minsize);
     void bufferFlush() VL_MT_UNSAFE_ONE;
     inline void bufferCheck() {
         // Flush the write buffer if there's not enough space left for new information
@@ -107,8 +95,6 @@ private:
 
     static char* writeCode(char* writep, uint32_t code);
 
-    void finishLine(uint32_t code, char* writep);
-
     // CONSTRUCTORS
     VL_UNCOPYABLE(VerilatedVcd);
 
@@ -116,27 +102,22 @@ protected:
     //=========================================================================
     // Implementation of VerilatedTrace interface
 
-    // Implementations of protected virtual methods for VerilatedTrace
+    // Called when the trace moves forward to a new time point
     virtual void emitTimeChange(uint64_t timeui) override;
 
     // Hooks called from VerilatedTrace
     virtual bool preFullDump() override { return isOpen(); }
     virtual bool preChangeDump() override;
 
-    // Implementations of duck-typed methods for VerilatedTrace. These are
-    // called from only one place (namely full*) so always inline them.
-    inline void emitBit(uint32_t code, CData newval);
-    inline void emitCData(uint32_t code, CData newval, int bits);
-    inline void emitSData(uint32_t code, SData newval, int bits);
-    inline void emitIData(uint32_t code, IData newval, int bits);
-    inline void emitQData(uint32_t code, QData newval, int bits);
-    inline void emitWData(uint32_t code, const WData* newvalp, int bits);
-    inline void emitDouble(uint32_t code, double newval);
+    // Trace buffer management
+    virtual VerilatedVcdBuffer* getTraceBuffer() override;
+    virtual void commitTraceBuffer(VerilatedVcdBuffer*) override;
 
 public:
     //=========================================================================
     // External interface to client code
 
+    // CONSTRUCTOR
     explicit VerilatedVcd(VerilatedVcdFile* filep = nullptr);
     ~VerilatedVcd();
 
@@ -144,7 +125,7 @@ public:
     // Set size in megabytes after which new file should be created
     void rolloverMB(uint64_t rolloverMB) { m_rolloverMB = rolloverMB; }
 
-    // METHODS
+    // METHODS - All must be thread safe
     // Open the file; call isOpen() to see if errors
     void open(const char* filename) VL_MT_SAFE_EXCLUDES(m_mutex);
     // Open next data-only file
@@ -164,168 +145,95 @@ public:
     void declQuad(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
     void declArray(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
     void declDouble(uint32_t code, const char* name, bool array, int arraynum);
-
-#ifdef VL_TRACE_VCD_OLD_API
-    //=========================================================================
-    // Note: These are only for testing for backward compatibility with foreign
-    // code and is not used by Verilator. Do not use these as there is no
-    // guarantee of functionality.
-
-    void declTriBit(uint32_t code, const char* name, bool array, int arraynum);
-    void declTriBus(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
-    void declTriQuad(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
-    void declTriArray(uint32_t code, const char* name, bool array, int arraynum, int msb, int lsb);
-
-    void fullBit(uint32_t* oldp, CData newval) { fullBit(oldp - this->oldp(0), newval); }
-    void fullCData(uint32_t* oldp, CData newval, int bits) {
-        fullBus(oldp - this->oldp(0), newval, bits);
-    }
-    void fullSData(uint32_t* oldp, SData newval, int bits) {
-        fullBus(oldp - this->oldp(0), newval, bits);
-    }
-    void fullIData(uint32_t* oldp, IData newval, int bits) {
-        fullBus(oldp - this->oldp(0), newval, bits);
-    }
-    void fullQData(uint32_t* oldp, QData newval, int bits) {
-        fullQuad(oldp - this->oldp(0), newval, bits);
-    }
-    void fullWData(uint32_t* oldp, const WData* newvalp, int bits) {
-        fullArray(oldp - this->oldp(0), newvalp, bits);
-    }
-    void fullDouble(uint32_t* oldp, double newval) { fullDouble(oldp - this->oldp(0), newval); }
-
-    inline void chgBit(uint32_t* oldp, CData newval) { chgBit(oldp - this->oldp(0), newval); }
-    inline void chgCData(uint32_t* oldp, CData newval, int bits) {
-        chgBus(oldp - this->oldp(0), newval, bits);
-    }
-    inline void chgSData(uint32_t* oldp, SData newval, int bits) {
-        chgBus(oldp - this->oldp(0), newval, bits);
-    }
-    inline void chgIData(uint32_t* oldp, IData newval, int bits) {
-        chgBus(oldp - this->oldp(0), newval, bits);
-    }
-    inline void chgQData(uint32_t* oldp, QData newval, int bits) {
-        chgQuad(oldp - this->oldp(0), newval, bits);
-    }
-    inline void chgWData(uint32_t* oldp, const WData* newvalp, int bits) {
-        chgArray(oldp - this->oldp(0), newvalp, bits);
-    }
-    inline void chgDouble(uint32_t* oldp, double newval) {
-        chgDouble(oldp - this->oldp(0), newval);
-    }
-
-    // Inside dumping routines, dump one signal, faster when not inlined
-    // due to code size reduction.
-    void fullBit(uint32_t code, const uint32_t newval);
-    void fullBus(uint32_t code, const uint32_t newval, int bits);
-    void fullQuad(uint32_t code, const uint64_t newval, int bits);
-    void fullArray(uint32_t code, const uint32_t* newvalp, int bits);
-    void fullArray(uint32_t code, const uint64_t* newvalp, int bits);
-    void fullTriBit(uint32_t code, const uint32_t newval, const uint32_t newtri);
-    void fullTriBus(uint32_t code, const uint32_t newval, const uint32_t newtri, int bits);
-    void fullTriQuad(uint32_t code, const uint64_t newval, const uint64_t newtri, int bits);
-    void fullTriArray(uint32_t code, const uint32_t* newvalp, const uint32_t* newtrip, int bits);
-    void fullDouble(uint32_t code, const double newval);
-
-    // Inside dumping routines, dump one signal if it has changed.
-    // We do want to inline these to avoid calls when the value did not change.
-    inline void chgBit(uint32_t code, const uint32_t newval) {
-        const uint32_t diff = oldp(code)[0] ^ newval;
-        if (VL_UNLIKELY(diff)) fullBit(code, newval);
-    }
-    inline void chgBus(uint32_t code, const uint32_t newval, int bits) {
-        const uint32_t diff = oldp(code)[0] ^ newval;
-        if (VL_UNLIKELY(diff)) {
-            if (VL_UNLIKELY(bits == 32 || (diff & ((1U << bits) - 1)))) {
-                fullBus(code, newval, bits);
-            }
-        }
-    }
-    inline void chgQuad(uint32_t code, const uint64_t newval, int bits) {
-        const uint64_t diff = (*(reinterpret_cast<uint64_t*>(oldp(code)))) ^ newval;
-        if (VL_UNLIKELY(diff)) {
-            if (VL_UNLIKELY(bits == 64 || (diff & ((1ULL << bits) - 1)))) {
-                fullQuad(code, newval, bits);
-            }
-        }
-    }
-    inline void chgArray(uint32_t code, const uint32_t* newvalp, int bits) {
-        for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) {
-            if (VL_UNLIKELY(oldp(code)[word] ^ newvalp[word])) {
-                fullArray(code, newvalp, bits);
-                return;
-            }
-        }
-    }
-    inline void chgArray(uint32_t code, const uint64_t* newvalp, int bits) {
-        for (int word = 0; word < (((bits - 1) / 64) + 1); ++word) {
-            if (VL_UNLIKELY(*(reinterpret_cast<uint64_t*>(oldp(code + 2 * word)))
-                            ^ newvalp[word])) {
-                fullArray(code, newvalp, bits);
-                return;
-            }
-        }
-    }
-    inline void chgTriBit(uint32_t code, const uint32_t newval, const uint32_t newtri) {
-        const uint32_t diff = ((oldp(code)[0] ^ newval) | (oldp(code)[1] ^ newtri));
-        if (VL_UNLIKELY(diff)) {
-            // Verilator 3.510 and newer provide clean input, so the below
-            // is only for back compatibility
-            if (VL_UNLIKELY(diff & 1)) {  // Change after clean?
-                fullTriBit(code, newval, newtri);
-            }
-        }
-    }
-    inline void chgTriBus(uint32_t code, const uint32_t newval, const uint32_t newtri, int bits) {
-        const uint32_t diff = ((oldp(code)[0] ^ newval) | (oldp(code)[1] ^ newtri));
-        if (VL_UNLIKELY(diff)) {
-            if (VL_UNLIKELY(bits == 32 || (diff & ((1U << bits) - 1)))) {
-                fullTriBus(code, newval, newtri, bits);
-            }
-        }
-    }
-    inline void chgTriQuad(uint32_t code, const uint64_t newval, const uint64_t newtri, int bits) {
-        const uint64_t diff = (((*(reinterpret_cast<uint64_t*>(oldp(code)))) ^ newval)
-                               | ((*(reinterpret_cast<uint64_t*>(oldp(code + 1)))) ^ newtri));
-        if (VL_UNLIKELY(diff)) {
-            if (VL_UNLIKELY(bits == 64 || (diff & ((1ULL << bits) - 1)))) {
-                fullTriQuad(code, newval, newtri, bits);
-            }
-        }
-    }
-    inline void chgTriArray(uint32_t code, const uint32_t* newvalp, const uint32_t* newtrip,
-                            int bits) {
-        for (int word = 0; word < (((bits - 1) / 32) + 1); ++word) {
-            if (VL_UNLIKELY((oldp(code)[word * 2] ^ newvalp[word])
-                            | (oldp(code)[word * 2 + 1] ^ newtrip[word]))) {
-                fullTriArray(code, newvalp, newtrip, bits);
-                return;
-            }
-        }
-    }
-    inline void chgDouble(uint32_t code, const double newval) {
-        // cppcheck-suppress invalidPointerCast
-        if (VL_UNLIKELY((*(reinterpret_cast<double*>(oldp(code)))) != newval)) {
-            fullDouble(code, newval);
-        }
-    }
-
-    // METHODS
-    // Old/standalone API only
-    void evcd(bool flag) { m_evcd = flag; }
-#endif  // VL_TRACE_VCD_OLD_API
 };
 
 #ifndef DOXYGEN
-// Declare specializations here they are used in VerilatedVcdC just below
-template <> void VerilatedTrace<VerilatedVcd>::dump(uint64_t timeui);
-template <> void VerilatedTrace<VerilatedVcd>::set_time_unit(const char* unitp);
-template <> void VerilatedTrace<VerilatedVcd>::set_time_unit(const std::string& unit);
-template <> void VerilatedTrace<VerilatedVcd>::set_time_resolution(const char* unitp);
-template <> void VerilatedTrace<VerilatedVcd>::set_time_resolution(const std::string& unit);
-template <> void VerilatedTrace<VerilatedVcd>::dumpvars(int level, const std::string& hier);
+// Declare specializations here as they are used in VerilatedVcdC just below
+template <> void VerilatedVcd::Super::dump(uint64_t time);
+template <> void VerilatedVcd::Super::set_time_unit(const char* unitp);
+template <> void VerilatedVcd::Super::set_time_unit(const std::string& unit);
+template <> void VerilatedVcd::Super::set_time_resolution(const char* unitp);
+template <> void VerilatedVcd::Super::set_time_resolution(const std::string& unit);
+template <> void VerilatedVcd::Super::dumpvars(int level, const std::string& hier);
 #endif  // DOXYGEN
 
+//=============================================================================
+// VerilatedVcdBuffer
+
+class VerilatedVcdBuffer final : public VerilatedTraceBuffer<VerilatedVcd, VerilatedVcdBuffer> {
+    // Give the trace file access to the private bits
+    friend VerilatedVcd;
+    friend VerilatedVcd::Super;
+
+#ifdef VL_TRACE_PARALLEL
+    char* m_writep;  // Write pointer into m_bufp
+    char* m_bufp;  // The beginning of the trace buffer
+    size_t m_size;  // The size of the buffer at m_bufp
+    char* m_growp;  // Resize limit pointer
+#else
+    char* m_writep = m_owner.m_writep;  // Write pointer into output buffer
+    char* const m_wrFlushp = m_owner.m_wrFlushp;  // Output buffer flush trigger location
+#endif
+
+    // VCD line end string codes + metadata
+    const char* const m_suffixes = m_owner.m_suffixes.data();
+    // The maximum number of bytes a single signal can emit
+    const size_t m_maxSignalBytes = m_owner.m_maxSignalBytes;
+
+    void finishLine(uint32_t code, char* writep);
+
+#ifdef VL_TRACE_PARALLEL
+    void adjustGrowp() {
+        m_growp = (m_bufp + m_size) - (2 * m_maxSignalBytes);
+        assert(m_growp >= m_bufp + m_maxSignalBytes);
+    }
+#endif
+
+public:
+    // CONSTRUCTOR
+#ifdef VL_TRACE_PARALLEL
+    explicit VerilatedVcdBuffer(VerilatedVcd& owner, char* bufp, size_t size);
+#else
+    explicit VerilatedVcdBuffer(VerilatedVcd& owner);
+#endif
+    ~VerilatedVcdBuffer() = default;
+
+    //=========================================================================
+    // Implementation of VerilatedTraceBuffer interface
+
+    // Implementations of duck-typed methods for VerilatedTraceBuffer. These are
+    // called from only one place (the full* methods), so always inline them.
+    VL_ATTR_ALWINLINE inline void emitBit(uint32_t code, CData newval);
+    VL_ATTR_ALWINLINE inline void emitCData(uint32_t code, CData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitSData(uint32_t code, SData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitIData(uint32_t code, IData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitQData(uint32_t code, QData newval, int bits);
+    VL_ATTR_ALWINLINE inline void emitWData(uint32_t code, const WData* newvalp, int bits);
+    VL_ATTR_ALWINLINE inline void emitDouble(uint32_t code, double newval);
+};
+
+//=============================================================================
+// VerilatedVcdFile
+/// Class representing a file to write to. These virtual methods can be
+/// overridden for e.g. socket I/O.
+
+class VerilatedVcdFile VL_NOT_FINAL {
+private:
+    int m_fd = 0;  // File descriptor we're writing to
+public:
+    // METHODS
+    /// Construct an (as yet) closed file
+    VerilatedVcdFile() = default;
+    /// Close and destruct
+    virtual ~VerilatedVcdFile() = default;
+    /// Open a file with given filename
+    virtual bool open(const std::string& name) VL_MT_UNSAFE;
+    /// Close object's file
+    virtual void close() VL_MT_UNSAFE;
+    /// Write data to file (if it is open)
+    virtual ssize_t write(const char* bufp, ssize_t len) VL_MT_UNSAFE;
+};
+
 //=============================================================================
 // VerilatedVcdC
 /// Class representing a VCD dump file in C standalone (no SystemC)
@@ -396,16 +304,6 @@ public:
 
     // Internal class access
     inline VerilatedVcd* spTrace() { return &m_sptrace; }
-
-#ifdef VL_TRACE_VCD_OLD_API
-    //=========================================================================
-    // Note: These are only for testing for backward compatibility with foreign
-    // code and is not used by Verilator. Do not use these as there is no
-    // guarantee of functionality.
-
-    // Use evcd format
-    void evcd(bool flag) VL_MT_UNSAFE_ONE { m_sptrace.evcd(flag); }
-#endif
 };
 
 #endif  // guard
diff --git a/include/verilatedos.h b/include/verilatedos.h
index 28412cac4..6bacfe27b 100644
--- a/include/verilatedos.h
+++ b/include/verilatedos.h
@@ -40,6 +40,7 @@
 #ifdef __GNUC__
 # define VL_ATTR_ALIGNED(alignment) __attribute__((aligned(alignment)))
 # define VL_ATTR_ALWINLINE __attribute__((always_inline))
+# define VL_ATTR_NOINLINE __attribute__((noinline))
 # define VL_ATTR_COLD __attribute__((cold))
 # define VL_ATTR_HOT __attribute__((hot))
 # define VL_ATTR_NORETURN __attribute__((noreturn))
@@ -82,6 +83,9 @@
 #ifndef VL_ATTR_ALWINLINE
 # define VL_ATTR_ALWINLINE  ///< Attribute to inline, even when not optimizing
 #endif
+#ifndef VL_ATTR_NOINLINE
+# define VL_ATTR_NOINLINE  ///< Attribute to never inline, even when optimizing
+#endif
 #ifndef VL_ATTR_COLD
 # define VL_ATTR_COLD  ///< Attribute that function is rarely executed
 #endif
diff --git a/src/V3AstNodes.h b/src/V3AstNodes.h
index b200d121b..7d3189551 100644
--- a/src/V3AstNodes.h
+++ b/src/V3AstNodes.h
@@ -8533,6 +8533,7 @@ public:
     AstNodeDType* childDTypep() const { return VN_AS(op1p(), NodeDType); }
     void childDTypep(AstNodeDType* nodep) { setOp1p(nodep); }
     AstNode* itemsp() const { return op2p(); }  // op2 = AstPatReplicate, AstPatMember, etc
+    void addItemsp(AstNode* nodep) { addOp2p(nodep); }
 };
 class AstPatMember final : public AstNodeMath {
     // Verilog '{a} or '{a{b}}
diff --git a/src/V3AstUserAllocator.h b/src/V3AstUserAllocator.h
index d230f0829..8d63ad5a9 100644
--- a/src/V3AstUserAllocator.h
+++ b/src/V3AstUserAllocator.h
@@ -106,7 +106,7 @@ public:
     }
 
     // Get a reference to the user data
-    T_Data& operator()(const T_Node* nodep) {
+    T_Data& operator()(const T_Node* nodep) const {
         T_Data* const userp = getUserp(nodep);
         UASSERT_OBJ(userp, nodep, "Missing User data on const AstNode");
         return *userp;
diff --git a/src/V3Case.cpp b/src/V3Case.cpp
index 161f7db7e..c65fb3e7d 100644
--- a/src/V3Case.cpp
+++ b/src/V3Case.cpp
@@ -496,7 +496,7 @@ private:
         V3Case::caseLint(nodep);
         iterateChildren(nodep);
         if (debug() >= 9) nodep->dumpTree(cout, " case_old: ");
-        if (isCaseTreeFast(nodep) && v3Global.opt.oCase()) {
+        if (isCaseTreeFast(nodep) && v3Global.opt.fCase()) {
             // It's a simple priority encoder or complete statement
             // we can make a tree of statements to avoid extra comparisons
             ++m_statCaseFast;
diff --git a/src/V3Const.cpp b/src/V3Const.cpp
index 2cf230a7f..bfd6919df 100644
--- a/src/V3Const.cpp
+++ b/src/V3Const.cpp
@@ -111,6 +111,15 @@ class ConstBitOpTreeVisitor final : public VNVisitor {
         BitPolarityEntry() = default;
     };
 
+    struct FrozenNodeInfo final {  // Context when a frozen node is found
+        bool m_polarity;
+        int m_lsb;
+        bool operator<(const FrozenNodeInfo& other) const {
+            if (m_lsb != other.m_lsb) return m_lsb < other.m_lsb;
+            return m_polarity < other.m_polarity;
+        }
+    };
+
     class Restorer final {  // Restore the original state unless disableRestore() is called
         ConstBitOpTreeVisitor& m_visitor;
         const size_t m_polaritiesSize;
@@ -299,7 +308,8 @@ class ConstBitOpTreeVisitor final : public VNVisitor {
     LeafInfo* m_leafp = nullptr;  // AstConst or AstVarRef that currently looking for
     const AstNode* const m_rootp;  // Root of this AST subtree
 
-    std::vector<AstNode*> m_frozenNodes;  // Nodes that cannot be optimized
+    std::vector<std::pair<AstNode*, FrozenNodeInfo>>
+        m_frozenNodes;  // Nodes that cannot be optimized
     std::vector<BitPolarityEntry> m_bitPolarities;  // Polarity of bits found during iterate()
     std::vector<std::unique_ptr<VarInfo>> m_varInfos;  // VarInfo for each variable, [0] is nullptr
 
@@ -487,7 +497,7 @@ class ConstBitOpTreeVisitor final : public VNVisitor {
                     restorer.restoreNow();
                     // Reach past a cast then add to frozen nodes to be added to final reduction
                     if (const AstCCast* const castp = VN_CAST(opp, CCast)) opp = castp->lhsp();
-                    m_frozenNodes.push_back(opp);
+                    m_frozenNodes.emplace_back(opp, FrozenNodeInfo{m_polarity, m_lsb});
                     m_failed = origFailed;
                     continue;
                 }
@@ -652,17 +662,21 @@ public:
             }
         }
 
+        std::map<FrozenNodeInfo, std::vector<AstNode*>> frozenNodes;  // Group by FrozenNodeInfo
         // Check if frozen terms are clean or not
-        for (AstNode* const termp : visitor.m_frozenNodes) {
+        for (const auto& frozenInfo : visitor.m_frozenNodes) {
+            AstNode* const termp = frozenInfo.first;
             // Comparison operators are clean
-            if (VN_IS(termp, Eq) || VN_IS(termp, Neq) || VN_IS(termp, Lt) || VN_IS(termp, Lte)
-                || VN_IS(termp, Gt) || VN_IS(termp, Gte)) {
+            if ((VN_IS(termp, Eq) || VN_IS(termp, Neq) || VN_IS(termp, Lt) || VN_IS(termp, Lte)
+                 || VN_IS(termp, Gt) || VN_IS(termp, Gte))
+                && frozenInfo.second.m_lsb == 0) {
                 hasCleanTerm = true;
             } else {
                 // Otherwise, conservatively assume the frozen term is dirty
                 hasDirtyTerm = true;
                 UINFO(9, "Dirty frozen term: " << termp << endl);
             }
+            frozenNodes[frozenInfo.second].push_back(termp);
         }
 
         // Figure out if a final negation is required
@@ -672,7 +686,12 @@ public:
         const bool needsCleaning = visitor.isAndTree() ? !hasCleanTerm : hasDirtyTerm;
 
         // Add size of reduction tree to op count
-        resultOps += termps.size() + visitor.m_frozenNodes.size() - 1;
+        resultOps += termps.size() - 1;
+        for (const auto& lsbAndNodes : frozenNodes) {
+            if (lsbAndNodes.first.m_lsb > 0) ++resultOps;  // Needs AstShiftR
+            if (!lsbAndNodes.first.m_polarity) ++resultOps;  // Needs AstNot
+            resultOps += lsbAndNodes.second.size();
+        }
         // Add final polarity flip in Xor tree
         if (needsFlip) ++resultOps;
         // Add final cleaning AND
@@ -681,7 +700,10 @@ public:
         if (debug() >= 9) {  // LCOV_EXCL_START
             cout << "Bitop tree considered: " << endl;
             for (AstNode* const termp : termps) termp->dumpTree("Reduced term: ");
-            for (AstNode* const termp : visitor.m_frozenNodes) termp->dumpTree("Frozen term: ");
+            for (const std::pair<AstNode*, FrozenNodeInfo>& termp : visitor.m_frozenNodes)
+                termp.first->dumpTree("Frozen term with lsb " + std::to_string(termp.second.m_lsb)
+                                      + " polarity " + std::to_string(termp.second.m_polarity)
+                                      + ": ");
             cout << "Needs flipping: " << needsFlip << endl;
             cout << "Needs cleaning: " << needsCleaning << endl;
             cout << "Size: " << resultOps << " input size: " << visitor.m_ops << endl;
@@ -724,8 +746,25 @@ public:
             resultp = reduce(resultp, termp);
         }
         // Add any frozen terms to the reduction
-        for (AstNode* const frozenp : visitor.m_frozenNodes) {
-            resultp = reduce(resultp, frozenp->unlinkFrBack());
+        for (auto&& nodes : frozenNodes) {
+            // nodes.second has same lsb and polarity
+            AstNode* termp = nullptr;
+            for (AstNode* const itemp : nodes.second) {
+                termp = reduce(termp, itemp->unlinkFrBack());
+            }
+            if (nodes.first.m_lsb > 0) {  // LSB is not 0, so shiftR
+                AstNodeDType* const dtypep = termp->dtypep();
+                termp = new AstShiftR{termp->fileline(), termp,
+                                      new AstConst(termp->fileline(), AstConst::WidthedValue{},
+                                                   termp->width(), nodes.first.m_lsb)};
+                termp->dtypep(dtypep);
+            }
+            if (!nodes.first.m_polarity) {  // Polarity is inverted, so append Not
+                AstNodeDType* const dtypep = termp->dtypep();
+                termp = new AstNot{termp->fileline(), termp};
+                termp->dtypep(dtypep);
+            }
+            resultp = reduce(resultp, termp);
         }
 
         // Set width of masks to expected result width. This is required to prevent later removal
@@ -1051,7 +1090,7 @@ private:
 
     bool matchBitOpTree(AstNode* nodep) {
         if (nodep->widthMin() != 1) return false;
-        if (!v3Global.opt.oConstBitOpTree()) return false;
+        if (!v3Global.opt.fConstBitOpTree()) return false;
 
         string debugPrefix;
         if (debug() >= 9) {  // LCOV_EXCL_START
@@ -1373,7 +1412,7 @@ private:
         return (VN_IS(nodep, And) || VN_IS(nodep, Or) || VN_IS(nodep, Xor));
     }
     bool ifAdjacentSel(const AstSel* lhsp, const AstSel* rhsp) {
-        if (!v3Global.opt.oAssemble()) return false;  // opt disabled
+        if (!v3Global.opt.fAssemble()) return false;  // opt disabled
         if (!lhsp || !rhsp) return false;
         const AstNode* const lfromp = lhsp->fromp();
         const AstNode* const rfromp = rhsp->fromp();
@@ -1388,7 +1427,7 @@ private:
     }
     bool ifMergeAdjacent(AstNode* lhsp, AstNode* rhsp) {
         // called by concatmergeable to determine if {lhsp, rhsp} make sense
-        if (!v3Global.opt.oAssemble()) return false;  // opt disabled
+        if (!v3Global.opt.fAssemble()) return false;  // opt disabled
         // two same varref
         if (operandsSame(lhsp, rhsp)) return true;
         const AstSel* lselp = VN_CAST(lhsp, Sel);
@@ -1425,7 +1464,7 @@ private:
     }
     bool concatMergeable(const AstNode* lhsp, const AstNode* rhsp, unsigned depth) {
         // determine if {a OP b, c OP d} => {a, c} OP {b, d} is advantageous
-        if (!v3Global.opt.oAssemble()) return false;  // opt disabled
+        if (!v3Global.opt.fAssemble()) return false;  // opt disabled
         if (lhsp->type() != rhsp->type()) return false;
         if (!ifConcatMergeableBiop(lhsp)) return false;
         if (depth > CONCAT_MERGABLE_MAX_DEPTH) return false;  // As worse case O(n^2) algorithm
@@ -2511,7 +2550,7 @@ private:
             if (nodep->access().isReadOnly()
                 && ((!m_params  // Can reduce constant wires into equations
                      && m_doNConst
-                     && v3Global.opt.oConst()
+                     && v3Global.opt.fConst()
                      // Default value, not a "known" constant for this usage
                      && !nodep->varp()->isClassMember()
                      && !(nodep->varp()->isFuncLocal() && nodep->varp()->isNonOutput())
diff --git a/src/V3EmitCImp.cpp b/src/V3EmitCImp.cpp
index e07648a2d..2cc813751 100644
--- a/src/V3EmitCImp.cpp
+++ b/src/V3EmitCImp.cpp
@@ -752,26 +752,26 @@ class EmitCTrace final : EmitCFunc {
         const string func = nodep->full() ? "full" : "chg";
         bool emitWidth = true;
         if (nodep->dtypep()->basicp()->isDouble()) {
-            puts("tracep->" + func + "Double");
+            puts("bufp->" + func + "Double");
             emitWidth = false;
         } else if (nodep->isWide() || emitTraceIsScBv(nodep) || emitTraceIsScBigUint(nodep)) {
-            puts("tracep->" + func + "WData");
+            puts("bufp->" + func + "WData");
         } else if (nodep->isQuad()) {
-            puts("tracep->" + func + "QData");
+            puts("bufp->" + func + "QData");
         } else if (nodep->declp()->widthMin() > 16) {
-            puts("tracep->" + func + "IData");
+            puts("bufp->" + func + "IData");
         } else if (nodep->declp()->widthMin() > 8) {
-            puts("tracep->" + func + "SData");
+            puts("bufp->" + func + "SData");
         } else if (nodep->declp()->widthMin() > 1) {
-            puts("tracep->" + func + "CData");
+            puts("bufp->" + func + "CData");
         } else {
-            puts("tracep->" + func + "Bit");
+            puts("bufp->" + func + "Bit");
             emitWidth = false;
         }
 
         const uint32_t offset = (arrayindex < 0) ? 0 : (arrayindex * nodep->declp()->widthWords());
         const uint32_t code = nodep->declp()->code() + offset;
-        puts(v3Global.opt.useTraceOffloadThread() && !nodep->full() ? "(base+" : "(oldp+");
+        puts(v3Global.opt.useTraceOffload() && !nodep->full() ? "(base+" : "(oldp+");
         puts(cvtToStr(code - nodep->baseCode()));
         puts(",");
         emitTraceValue(nodep, arrayindex);
diff --git a/src/V3EmitCMake.cpp b/src/V3EmitCMake.cpp
index 67e8a741c..7df71dfeb 100644
--- a/src/V3EmitCMake.cpp
+++ b/src/V3EmitCMake.cpp
@@ -113,9 +113,8 @@ class CMakeEmitter final {
         cmake_set_raw(*of, name + "_COVERAGE", v3Global.opt.coverage() ? "1" : "0");
         *of << "# Threaded output mode?  0/1/N threads (from --threads)\n";
         cmake_set_raw(*of, name + "_THREADS", cvtToStr(v3Global.opt.threads()));
-        *of << "# Threaded tracing output mode?  0/1/N threads (from --trace-threads)\n";
-        cmake_set_raw(*of, name + "_TRACE_THREADS",
-                      cvtToStr(v3Global.opt.useTraceOffloadThread()));
+        *of << "# Threaded tracing output mode?  0/1/N threads (from --threads/--trace-threads)\n";
+        cmake_set_raw(*of, name + "_TRACE_THREADS", cvtToStr(v3Global.opt.vmTraceThreads()));
         cmake_set_raw(*of, name + "_TRACE_FST_WRITER_THREAD",
                       v3Global.opt.traceThreads() && v3Global.opt.traceFormat().fst() ? "1" : "0");
         *of << "# Struct output mode?  0/1 (from --trace-structs)\n";
diff --git a/src/V3EmitMk.cpp b/src/V3EmitMk.cpp
index 429b78d33..b748d9553 100644
--- a/src/V3EmitMk.cpp
+++ b/src/V3EmitMk.cpp
@@ -73,9 +73,10 @@ public:
         of.puts("VM_TRACE_FST = ");
         of.puts(v3Global.opt.trace() && v3Global.opt.traceFormat().fst() ? "1" : "0");
         of.puts("\n");
-        of.puts("# Tracing threaded output mode?  0/1/N threads (from --trace-thread)\n");
+        of.puts(
+            "# Tracing threaded output mode?  0/1/N threads (from --threads/--trace-thread)\n");
         of.puts("VM_TRACE_THREADS = ");
-        of.puts(cvtToStr(v3Global.opt.useTraceOffloadThread()));
+        of.puts(cvtToStr(v3Global.opt.vmTraceThreads()));
         of.puts("\n");
         of.puts("# Separate FST writer thread? 0/1 (from --trace-fst with --trace-thread > 0)\n");
         of.puts("VM_TRACE_FST_WRITER_THREAD = ");
diff --git a/src/V3Gate.cpp b/src/V3Gate.cpp
index 4b66c2661..cf3485121 100644
--- a/src/V3Gate.cpp
+++ b/src/V3Gate.cpp
@@ -397,11 +397,11 @@ private:
         // Then propagate more complicated equations
         optimizeSignals(true);
         // Remove redundant logic
-        if (v3Global.opt.oDedupe()) {
+        if (v3Global.opt.fDedupe()) {
             dedupe();
             if (debug() >= 6) m_graph.dumpDotFilePrefixed("gate_dedup");
         }
-        if (v3Global.opt.oAssemble()) {
+        if (v3Global.opt.fAssemble()) {
             mergeAssigns();
             if (debug() >= 6) m_graph.dumpDotFilePrefixed("gate_assm");
         }
diff --git a/src/V3GraphAcyc.cpp b/src/V3GraphAcyc.cpp
index a62fd3d9d..0df758ed1 100644
--- a/src/V3GraphAcyc.cpp
+++ b/src/V3GraphAcyc.cpp
@@ -254,7 +254,7 @@ void GraphAcyc::simplify(bool allowCut) {
         if (allowCut) {
             // The main algorithm works without these, though slower
             // So if changing the main algorithm, comment these out for a test run
-            if (v3Global.opt.oAcycSimp()) {
+            if (v3Global.opt.fAcycSimp()) {
                 cutBasic(vertexp);
                 cutBackward(vertexp);
             }
diff --git a/src/V3MergeCond.cpp b/src/V3MergeCond.cpp
index 673326f27..3881c48df 100644
--- a/src/V3MergeCond.cpp
+++ b/src/V3MergeCond.cpp
@@ -42,6 +42,34 @@
 //
 //  Also merges consecutive AstNodeIf statements with the same condition.
 //
+//  Because this optimization has notable performance impact, we go further
+//  and perform code motion to try to move mergeable conditionals next to each
+//  other, which in turn enables us to merge more conditionals. To do this, we
+//  perform an analysis pass, followed by an optimization pass on the whole
+//  AstCFunc we are optimizing.
+//
+//  The analysis pass gathers, for each statement in the tree, the information
+//  relevant for determining whether two statements can be swapped, and some
+//  other additional information that is useful during optimization.
+//
+//  The optimization pass tries to move conditionals near each other, first by
+//  trying to move a conditional node backwards in the list, so it becomes the
+//  direct successor of another earlier conditional with the same condition.
+//  If this is not possible due to variable interference, then we additionally
+//  try to pull earlier conditionals with the same condition closer forward to
+//  be the immediate predecessor of the conditional node. We limit the maximum
+//  distance a node can travel to an empirically chosen but otherwise arbitrary
+//  constant. This limits worst case complexity to be O(n) rather than O(n^2).
+//  The worst case complexity manifests when N/2 conditionals, all with unique
+//  conditions are succeeded by N/2 conditionals with the same unique
+//  conditions, such that each unique condition is used by exactly 2
+//  conditionals. In this case all N/2 nodes need to travel approx N/2 distance.
+//  Limiting the distance bounds the latter, hence limiting complexity.
+//
+//  Once the analysis and optimization passes have been applied to the whole
+//  function, any merged conditionals will then undergo the same analysis,
+//  optimization, and merging again in their individual branches.
+//
 //*************************************************************************
 
 #include "config_build.h"
@@ -51,71 +79,364 @@
 #include "V3MergeCond.h"
 #include "V3Stats.h"
 #include "V3Ast.h"
+#include "V3AstUserAllocator.h"
+#include "V3Hasher.h"
+#include "V3DupFinder.h"
+
+#include <queue>
+#include <set>
+
+namespace {
 
 //######################################################################
+// Utilities
 
-enum class Mergeable {
-    YES,  // Tree can be merged
-    NO_COND_ASSIGN,  // Tree cannot be merged because it contains an assignment to a condition
-    NO_IMPURE  // Tree cannot be merged because it contains an impure node
+// Extract the Cond node from the RHS of an assignment, if there is one and it
+// is in a supported position; otherwise return nullptr. Supported positions:
+// - RHS is the Cond itself
+// - RHS is And(Const, Cond). This And is often inserted by V3Clean.
+AstNodeCond* extractCondFromRhs(AstNode* rhsp) {
+    if (AstNodeCond* const condp = VN_CAST(rhsp, NodeCond)) {
+        return condp;
+    } else if (const AstAnd* const andp = VN_CAST(rhsp, And)) {
+        if (AstNodeCond* const condp = VN_CAST(andp->rhsp(), NodeCond)) {
+            if (VN_IS(andp->lhsp(), Const)) return condp;  // Only when the LHS is a constant
+        }
+    }
+    return nullptr;
+}
+
+// Predicate to check if two sets are disjoint. The result is stable, as we only
+// need to determine if the sets contain a shared element, which is a boolean
+// property. Both sets are sorted, so we enumerate elements in order (what the
+// ordering is, is unimportant), skipping ahead with std::lower_bound; comparisons
+// are few, but NOTE(review): lower_bound advances std::set iterators linearly.
+bool areDisjoint(const std::set<const AstVar*>& a, const std::set<const AstVar*>& b) {
+    if (a.empty() || b.empty()) return true;  // An empty set shares no element
+    const auto endA = a.end();
+    const auto endB = b.end();
+    auto itA = a.begin();
+    auto itB = b.begin();
+    while (true) {
+        if (*itA == *itB) return false;  // Found a shared element
+        if (std::less<const AstVar*>{}(*itA, *itB)) {
+            itA = std::lower_bound(++itA, endA, *itB);  // Skip elements of 'a' below *itB
+            if (itA == endA) return true;
+        } else {
+            itB = std::lower_bound(++itB, endB, *itA);  // Skip elements of 'b' below *itA
+            if (itB == endB) return true;
+        }
+    }
+}
+
+//######################################################################
+// Structure containing information required for code motion/merging
+
+struct StmtProperties {
+    AstNode* m_condp = nullptr;  // The condition expression, if a conditional node
+    std::set<const AstVar*> m_rdVars;  // Variables read by this statement
+    std::set<const AstVar*> m_wrVars;  // Variables written by this statement
+    bool m_isFence = false;  // Nothing should move across this statement, nor should it be merged
+    AstNodeStmt* m_prevWithSameCondp = nullptr;  // Previous node in same list, with same condition
+    bool writesConditionVar() const {  // True if any written variable has user1 set
+        // This relies on MarkVarsVisitor having been called on the condition node
+        for (const AstVar* const varp : m_wrVars) {
+            if (varp->user1()) return true;
+        }
+        return false;
+    }
 };
 
-class CheckMergeableVisitor final : public VNVisitor {
-private:
-    // STATE
-    bool m_condAssign = false;  // Does this tree contain an assignment to a condition variable??
-    bool m_impure = false;  // Does this tree contain an impure node?
+// We store the statement properties in user3 via AstUser3Allocator
+using StmtPropertiesAllocator = AstUser3Allocator<AstNodeStmt, StmtProperties>;
 
-    // METHODS
-    VL_DEBUG_FUNC;  // Declare debug()
+//######################################################################
+// Code motion analysis and implementation
 
-    // VISITORS
-    virtual void visit(AstNode* nodep) override {
-        if (m_impure) return;
-        // Clear if node is impure
-        if (!nodep->isPure()) {
-            UINFO(9, "Not mergeable due to impure node" << nodep << endl);
-            m_impure = true;
-            return;
+// Pure analysis visitor that builds the StmtProperties for each statement in the given
+// AstNode list (following AstNode::nextp())
+class CodeMotionAnalysisVisitor final : public VNVisitor {
+    // NODE STATE
+    // AstNodeStmt::user3   -> StmtProperties (accessed via m_stmtProperties, managed externally,
+    //                         see MergeCondVisitor::process)
+    // AstNode::user4       -> Used by V3Hasher
+    // AstNode::user5       -> AstNode*: Set on a condition node, points to the last conditional
+    //                         with that condition so far encountered in the same AstNode list
+
+    VNUser5InUse m_user5InUse;
+
+    StmtPropertiesAllocator& m_stmtProperties;
+
+    // MEMBERS
+    V3Hasher m_hasher;  // Used by V3DupFinder
+    // Stack of V3DupFinder instances, each used for finding identical condition expressions
+    // within one (non-singleton) statement list.
+    std::vector<V3DupFinder> m_stack;
+    StmtProperties* m_propsp = nullptr;  // StmtProperties structure of current AstNodeStmt
+
+    // Extract condition expression from a mergeable conditional statement, if any
+    static AstNode* extractCondition(const AstNodeStmt* nodep) {
+        AstNode* conditionp = nullptr;
+        if (const AstNodeAssign* const assignp = VN_CAST(nodep, NodeAssign)) {
+            if (AstNodeCond* const conditionalp = extractCondFromRhs(assignp->rhsp())) {
+                conditionp = conditionalp->condp();
+            }
+        } else if (const AstNodeIf* const ifp = VN_CAST(nodep, NodeIf)) {
+            conditionp = ifp->condp();
         }
+        while (AstCCast* const castp = VN_CAST(conditionp, CCast)) conditionp = castp->lhsp();
+        return conditionp;
+    }
+
+    void analyzeStmt(AstNodeStmt* nodep, bool tryCondMatch) {
+        VL_RESTORER(m_propsp);
+        // Keep hold of props of enclosing statement
+        StmtProperties* const outerPropsp = m_propsp;
+        // Grab the props of this statement
+        m_propsp = &m_stmtProperties(nodep);
+
+        // Extract condition from statement
+        if (AstNode* const condp = extractCondition(nodep)) {
+            // Remember condition node. We always need this as it is used in the later
+            // traversal.
+            m_propsp->m_condp = condp;
+            // If this is a conditional statement, try to find an earlier one with the same
+            // condition in the same list (unless we have been told not to bother because we know
+            // this node is in a singleton list).
+            if (tryCondMatch) {
+                // Grab the duplicate finder of this list
+                V3DupFinder& dupFinder = m_stack.back();
+                // Find a duplicate condition
+                const V3DupFinder::iterator& dit = dupFinder.findDuplicate(condp);
+                if (dit == dupFinder.end()) {
+                    // First time seeing this condition in the current list
+                    dupFinder.insert(condp);
+                    // Remember last statement with this condition (which is this statement)
+                    condp->user5p(nodep);
+                } else {
+                    // Seen a conditional with the same condition earlier in the current list
+                    AstNode* const firstp = dit->second;
+                    // Add to properties for easy retrieval during optimization
+                    m_propsp->m_prevWithSameCondp = static_cast<AstNodeStmt*>(firstp->user5p());
+                    // Remember last statement with this condition (which is this statement)
+                    firstp->user5p(nodep);
+                }
+            }
+        }
+
+        // Analyse this statement
+        analyzeNode(nodep);
+
+        // If there is an enclosing statement, propagate properties upwards
+        if (outerPropsp) {
+            // Add all rd/wr vars to outer statement
+            outerPropsp->m_rdVars.insert(m_propsp->m_rdVars.cbegin(), m_propsp->m_rdVars.cend());
+            outerPropsp->m_wrVars.insert(m_propsp->m_wrVars.cbegin(), m_propsp->m_wrVars.cend());
+            // If this statement is a fence (impure), the enclosing statement is a fence too
+            if (m_propsp->m_isFence) outerPropsp->m_isFence = true;
+        }
+    }
+
+    void analyzeVarRef(AstVarRef* nodep) {
+        const VAccess access = nodep->access();
+        AstVar* const varp = nodep->varp();
+        // Gather read and written variables
+        if (access.isReadOrRW()) m_propsp->m_rdVars.insert(varp);
+        if (access.isWriteOrRW()) m_propsp->m_wrVars.insert(varp);
+    }
+
+    void analyzeNode(AstNode* nodep) {
+        // If an impure node under a statement, mark that statement as a fence
+        if (m_propsp && !nodep->isPure()) m_propsp->m_isFence = true;
+        // Analyze children
         iterateChildrenConst(nodep);
     }
-    virtual void visit(AstVarRef* nodep) override {
-        if (m_impure || m_condAssign) return;
-        // Clear if it's an LValue referencing a marked variable
-        if (nodep->access().isWriteOrRW() && nodep->varp()->user1()) {
-            UINFO(9, "Not mergeable due assignment to condition" << nodep << endl);
-            m_condAssign = true;
+
+    // VISITORS
+    void visit(AstNode* nodep) override {
+        // Push a new stack entry at the start of a list, but only if the list is not a
+        // single element (this saves a lot of allocations in expressions)
+        bool singletonListStart = false;
+        if (nodep->backp()->nextp() != nodep) {  // If at head of list
+            singletonListStart = nodep->nextp() == nullptr;
+            if (!singletonListStart) m_stack.emplace_back(m_hasher);
         }
+
+        // Analyse node
+        if (AstNodeStmt* const stmtp = VN_CAST(nodep, NodeStmt)) {
+            analyzeStmt(stmtp, /*tryCondMatch:*/ !singletonListStart);
+        } else if (AstVarRef* const vrefp = VN_CAST(nodep, VarRef)) {
+            analyzeVarRef(vrefp);
+        } else {
+            analyzeNode(nodep);
+        }
+
+        // Pop the stack at the end of a (non-singleton) list
+        if (!singletonListStart && !nodep->nextp()) m_stack.pop_back();
+    }
+
+    // CONSTRUCTOR
+    CodeMotionAnalysisVisitor(AstNode* nodep, StmtPropertiesAllocator& stmtProperties)
+        : m_stmtProperties(stmtProperties) {
+        iterateAndNextConstNull(nodep);
     }
 
 public:
-    CheckMergeableVisitor() = default;
-
-    // Return false if this node should not be merged at all because:
-    // - It contains an impure expression
-    // - It contains an LValue referencing the condition
-    Mergeable operator()(const AstNode* node) {
-        m_condAssign = false;
-        m_impure = false;
-        iterateChildrenConst(const_cast<AstNode*>(node));
-        if (m_impure) {  // Impure is stronger than cond assign
-            return Mergeable::NO_IMPURE;
-        } else if (m_condAssign) {
-            return Mergeable::NO_COND_ASSIGN;
-        } else {
-            return Mergeable::YES;
-        }
+    // Analyse the statement list starting at nodep, filling in stmtProperties.
+    static void analyze(AstNode* nodep, StmtPropertiesAllocator& stmtProperties) {
+        CodeMotionAnalysisVisitor{nodep, stmtProperties};
     }
 };
 
+class CodeMotionOptimizeVisitor final : public VNVisitor {
+    // Do not move a node more than this many statements.
+    // This bounds complexity at O(N), rather than O(N^2).
+    static constexpr unsigned MAX_DISTANCE = 500;
+
+    // NODE STATE
+    // AstNodeStmt::user3   -> StmtProperties (accessed via m_stmtProperties, managed externally,
+    //                         see MergeCondVisitor::process)
+    // AstNodeStmt::user4   -> bool: Already processed this node
+
+    VNUser4InUse m_user4InUse;
+
+    const StmtPropertiesAllocator& m_stmtProperties;
+
+    // MEMBERS
+
+    // Predicate that checks if the order of two statements can be swapped
+    bool areSwappable(const AstNodeStmt* ap, const AstNodeStmt* bp) const {
+        const StmtProperties& aProps = m_stmtProperties(ap);
+        const StmtProperties& bProps = m_stmtProperties(bp);
+        // Don't move across fences
+        if (aProps.m_isFence) return false;
+        if (bProps.m_isFence) return false;
+        // If either statement writes a variable that the other reads, they are not swappable
+        if (!areDisjoint(aProps.m_rdVars, bProps.m_wrVars)) return false;
+        if (!areDisjoint(bProps.m_rdVars, aProps.m_wrVars)) return false;
+        // If they both write to the same variable, they are not swappable
+        if (!areDisjoint(aProps.m_wrVars, bProps.m_wrVars)) return false;
+        // Otherwise good to go
+        return true;
+    }
+
+    // VISITORS
+    void visit(AstNodeStmt* nodep) override {
+        // Process only on first encounter
+        if (nodep->user4SetOnce()) return;
+        // First re-order children
+        iterateChildren(nodep);
+        // Grab hold of previous node with same condition
+        AstNodeStmt* prevp = m_stmtProperties(nodep).m_prevWithSameCondp;
+        // If no previous node with same condition, we are done
+        if (!prevp) return;
+#ifdef VL_DEBUG
+        {  // Sanity check, only in debug build, otherwise expensive
+            const AstNode* currp = prevp;
+            while (currp && currp != nodep) currp = currp->nextp();
+            UASSERT_OBJ(currp, nodep, "Predecessor not in same list as " << nodep);
+        }
+#endif
+        // Otherwise try to move this node backwards, as close as we can to the previous node
+        // with the same condition
+        if (AstNodeStmt* predp = VN_CAST(nodep->backp(), NodeStmt)) {
+            // 'predp' is the newly computed predecessor node of 'nodep', which is initially
+            // (without movement) the 'backp' of the node.
+            for (unsigned i = MAX_DISTANCE; i; --i) {
+                // If the predecessor is the previous node with the same condition, job done
+                if (predp == prevp) break;
+                // Don't move past a non-statement (e.g.: AstVar), or end of list
+                AstNodeStmt* const backp = VN_CAST(predp->backp(), NodeStmt);
+                if (!backp) break;
+                // Don't swap statements if doing so would change program semantics
+                if (!areSwappable(predp, nodep)) break;
+                // Otherwise move 'nodep' back
+                predp = backp;
+            }
+
+            // If we decided that 'nodep' should be moved back
+            if (nodep->backp() != predp) {
+                // Move the current node to directly follow the computed predecessor
+                nodep->unlinkFrBack();
+                predp->addNextHere(nodep);
+                // If the predecessor is the previous node with the same condition, job done
+                if (predp == prevp) return;
+            }
+        }
+        // If we reach here, it means we were unable to move the current node all the way back
+        // such that it immediately follows the previous statement with the same condition. Now
+        // try to move all previous statements with the same condition forward, in the hope of
+        // compacting the list further.
+        for (AstNodeStmt* currp = nodep; prevp;
+             currp = prevp, prevp = m_stmtProperties(currp).m_prevWithSameCondp) {
+            // Move prevp (previous statement with same condition) towards currp
+            if (AstNodeStmt* succp = VN_CAST(prevp->nextp(), NodeStmt)) {
+                // 'succp' is the newly computed successor node of 'prevp', which is initially
+                // (without movement) the 'nextp' of the node.
+                for (unsigned i = MAX_DISTANCE; --i;) {  // NOTE(review): MAX_DISTANCE - 1 steps
+                    // If the successor of the previous statement with same condition is the
+                    // target node, we are done with this predecessor
+                    if (succp == currp) break;
+                    // Don't move past a non-statement (e.g.: AstVar), or end of list
+                    AstNodeStmt* const nextp = VN_CAST(succp->nextp(), NodeStmt);
+                    if (!nextp) break;
+                    // Don't swap statements if doing so would change program semantics
+                    if (!areSwappable(prevp, succp)) break;
+                    // Otherwise move further forward
+                    succp = nextp;
+                }
+
+                // If we decided that 'prevp' should be moved forward
+                if (prevp->nextp() != succp) {
+                    // Move 'prevp' to directly before the computed successor
+                    prevp->unlinkFrBack();
+                    succp->addHereThisAsNext(prevp);
+                }
+            }
+        }
+    }
+
+    void visit(AstNode* nodep) override {}  // Ignore all non-statements
+
+    // CONSTRUCTOR
+    CodeMotionOptimizeVisitor(AstNode* nodep, const StmtPropertiesAllocator& stmtProperties)
+        : m_stmtProperties(stmtProperties) {
+        // We assert the given node is at the head of the list otherwise we might move a node
+        // before the given node. This is easy to fix in the above iteration with a check on a
+        // boundary node we should not move past, if we ever need to do so.
+        // Note: we will do iterateAndNextNull which requires nodep->backp() != nullptr anyway
+        UASSERT_OBJ(nodep->backp()->nextp() != nodep, nodep, "Must be at head of list");
+        // Optimize the list
+        iterateAndNextNull(nodep);
+    }
+
+public:
+    // Given an AstNode list (held via AstNode::nextp()), move conditional statements as close
+    // together as possible
+    static AstNode* optimize(AstNode* nodep, const StmtPropertiesAllocator& stmtProperties) {
+        CodeMotionOptimizeVisitor{nodep, stmtProperties};
+        // It is possible for the head of the list to be moved later such that it is no longer
+        // in head position. If so, rewind the list and return the new head.
+        while (nodep->backp()->nextp() == nodep) nodep = nodep->backp();
+        return nodep;
+    }
+};
+
+//######################################################################
+// Conditional merging
+
 class MergeCondVisitor final : public VNVisitor {
 private:
     // NODE STATE
-    // AstVar::user1        -> Flag set for variables referenced by m_mgCondp
-    // AstNode::user2       -> Flag marking node as included in merge because cheap to duplicate
-    const VNUser1InUse m_user1InUse;
-    const VNUser2InUse m_user2InUse;
+    // AstVar::user1        -> bool: Set for variables referenced by m_mgCondp
+    //                         (Only below MergeCondVisitor::process).
+    // AstNode::user2       -> bool: Marking node as included in merge because cheap to
+    //                         duplicate
+    //                         (Only below MergeCondVisitor::process).
+    // AstNodeStmt::user3   -> StmtProperties
+    //                         (Only below MergeCondVisitor::process).
+    // AstNode::user4       -> See CodeMotionAnalysisVisitor/CodeMotionOptimizeVisitor
+    // AstNode::user5       -> See CodeMotionAnalysisVisitor
 
     // STATE
     VDouble0 m_statMerges;  // Statistic tracking
@@ -128,24 +449,84 @@ private:
     const AstNode* m_mgNextp = nullptr;  // Next node in list being examined
     uint32_t m_listLenght = 0;  // Length of current list
 
-    CheckMergeableVisitor m_checkMergeable;  // Sub visitor for encapsulation & speed
+    std::queue<AstNode*>* m_workQueuep = nullptr;  // Node lists (via AstNode::nextp()) to merge
+    // Statement properties for code motion and merging
+    StmtPropertiesAllocator* m_stmtPropertiesp = nullptr;
 
     // METHODS
     VL_DEBUG_FUNC;  // Declare debug()
 
-    // This function extracts the Cond node from the RHS, if there is one and
-    // it is in a supported position, which are:
-    // - RHS is the Cond
-    // - RHS is And(Const, Cond). This And is inserted often by V3Clean.
-    static AstNodeCond* extractCond(AstNode* rhsp) {
-        if (AstNodeCond* const condp = VN_CAST(rhsp, NodeCond)) {
-            return condp;
-        } else if (const AstAnd* const andp = VN_CAST(rhsp, And)) {
-            if (AstNodeCond* const condp = VN_CAST(andp->rhsp(), NodeCond)) {
-                if (VN_IS(andp->lhsp(), Const)) return condp;
-            }
+    // Process the statement list headed by nodep, plus any lists queued while merging it
+    void process(AstNode* nodep) {
+        // Set up work queue (mergeEnd pushes the branches of each newly created AstIf)
+        std::queue<AstNode*> workQueue;
+        m_workQueuep = &workQueue;  // NOTE(review): not reset on return; confirm no later use
+        m_workQueuep->push(nodep);
+
+        do {
+            // Set up user* for this iteration
+            const VNUser1InUse user1InUse;
+            const VNUser2InUse user2InUse;
+            const VNUser3InUse user3InUse;
+            // Statement properties only preserved for this iteration,
+            // then memory is released immediately.
+            StmtPropertiesAllocator stmtProperties;
+            m_stmtPropertiesp = &stmtProperties;
+
+            // Pop off current work item
+            AstNode* currp = m_workQueuep->front();
+            m_workQueuep->pop();
+
+            // Analyse sub-tree list for code motion
+            CodeMotionAnalysisVisitor::analyze(currp, stmtProperties);
+            // Perform the code motion; the head of the list may change, so take the returned head
+            currp = CodeMotionOptimizeVisitor::optimize(currp, stmtProperties);
+
+            // Merge conditionals in the whole sub-tree list (this might create new work items)
+            iterateAndNextNull(currp);
+
+            // Close pending merge, if there is one at the end of the whole sub-tree list
+            if (m_mgFirstp) mergeEnd();
+        } while (!m_workQueuep->empty());
+    }
+
+    // Skip past AstArraySel and AstWordSel with const index
+    static AstNode* skipConstSels(AstNode* nodep) {
+        while (const AstArraySel* const aselp = VN_CAST(nodep, ArraySel)) {
+            // ArraySel index is not constant, so might be expensive
+            if (!VN_IS(aselp->bitp(), Const)) return nodep;
+            nodep = aselp->fromp();
         }
-        return nullptr;
+        while (const AstWordSel* const wselp = VN_CAST(nodep, WordSel)) {
+            // WordSel index is not constant, so might be expensive
+            if (!VN_IS(wselp->bitp(), Const)) return nodep;
+            nodep = wselp->fromp();
+        }
+        return nodep;
+    }
+
+    // Check if this node is cheap enough that duplicating it in two branches of an
+    // AstIf is not likely to cause a performance degradation.
+    static bool isCheapNode(AstNode* nodep) {
+        // Comments are cheap
+        if (VN_IS(nodep, Comment)) return true;
+        // So are some assignments
+        if (const AstNodeAssign* const assignp = VN_CAST(nodep, NodeAssign)) {
+            // Check LHS
+            AstNode* const lhsp = skipConstSels(assignp->lhsp());
+            // LHS is not a VarRef, so might be expensive
+            if (!VN_IS(lhsp, VarRef)) return false;
+
+            // Check RHS
+            AstNode* const rhsp = skipConstSels(assignp->rhsp());
+            // RHS is not a VarRef or Constant so might be expensive
+            if (!VN_IS(rhsp, VarRef) && !VN_IS(rhsp, Const)) return false;
+
+            // Otherwise it is a cheap assignment
+            return true;
+        }
+        // Others are not
+        return false;
     }
 
     // Predicate to check if an expression yields only 0 or 1 (i.e.: a 1-bit value)
@@ -196,23 +577,21 @@ private:
     static AstNode* maskLsb(AstNode* nodep) {
         if (yieldsOneOrZero(nodep)) return nodep;
         // Otherwise apply masking
-        AstNode* const maskp = new AstConst(nodep->fileline(), AstConst::BitTrue());
+        AstNode* const maskp = new AstConst{nodep->fileline(), AstConst::BitTrue()};
         // Mask on left, as conventional
-        return new AstAnd(nodep->fileline(), maskp, nodep);
+        return new AstAnd{nodep->fileline(), maskp, nodep};
     }
 
-    // Fold the RHS expression assuming the given condition state. Unlink bits
-    // from the RHS which is only used once, and can be reused. What remains
-    // of the RHS is expected to be deleted by the caller.
+    // Fold the RHS expression of an assignment assuming the given condition state.
+    // Unlink bits from the RHS which is only used once, and can be reused (is an unmodified
+    // sub-tree). What remains of the RHS is expected to be deleted by the caller.
     AstNode* foldAndUnlink(AstNode* rhsp, bool condTrue) {
         if (rhsp->sameTree(m_mgCondp)) {
-            return new AstConst(rhsp->fileline(), AstConst::BitTrue{}, condTrue);
-        } else if (const AstNodeCond* const condp = extractCond(rhsp)) {
+            return new AstConst{rhsp->fileline(), AstConst::BitTrue{}, condTrue};
+        } else if (const AstNodeCond* const condp = extractCondFromRhs(rhsp)) {
             AstNode* const resp
                 = condTrue ? condp->expr1p()->unlinkFrBack() : condp->expr2p()->unlinkFrBack();
-            if (condp == rhsp) {  //
-                return resp;
-            }
+            if (condp == rhsp) return resp;
             if (const AstAnd* const andp = VN_CAST(rhsp, And)) {
                 UASSERT_OBJ(andp->rhsp() == condp, rhsp, "Should not try to fold this");
                 return new AstAnd{andp->fileline(), andp->lhsp()->cloneTree(false), resp};
@@ -227,17 +606,18 @@ private:
                 return condTrue ? maskLsb(andp->lhsp()->unlinkFrBack())
                                 : new AstConst{rhsp->fileline(), AstConst::BitFalse()};
             }
-        } else if (VN_IS(rhsp, WordSel) || VN_IS(rhsp, VarRef) || VN_IS(rhsp, Const)) {
+        } else if (VN_IS(rhsp, ArraySel) || VN_IS(rhsp, WordSel) || VN_IS(rhsp, VarRef)
+                   || VN_IS(rhsp, Const)) {
             return rhsp->cloneTree(false);
         }
-        rhsp->dumpTree("Don't know how to fold expression: ");
-        rhsp->v3fatalSrc("Don't know how to fold expression");
+        // LCOV_EXCL_START
+        if (debug()) rhsp->dumpTree("Don't know how to fold expression: ");
+        rhsp->v3fatalSrc("Should not try to fold this during conditional merging");
+        // LCOV_EXCL_STOP
     }
 
-    void mergeEnd(int lineno) {
-        UASSERT(m_mgFirstp, "mergeEnd without list " << lineno);
-        // We might want to recursively merge an AstIf. We stash it in this variable.
-        const AstNodeIf* recursivep = nullptr;
+    void mergeEnd() {
+        UASSERT(m_mgFirstp, "mergeEnd without list");
         // Drop leading cheap nodes. These were only added in the hope of finding
         // an earlier reduced form, but we failed to do so.
         while (m_mgFirstp->user2() && m_mgFirstp != m_mgLastp) {
@@ -254,8 +634,11 @@ private:
             m_mgLastp = m_mgLastp->backp();
             --m_listLenght;
             UASSERT_OBJ(m_mgLastp && m_mgLastp->nextp() == nextp, m_mgFirstp,
-                        "Cheap assignment should not be at the front of the list");
+                        "Cheap statement should not be at the front of the list");
         }
+        // If the list contains a single AstNodeIf, we will want to merge its branches.
+        // If so, keep hold of the AstNodeIf in this variable.
+        AstNodeIf* recursivep = nullptr;
         // Merge if list is longer than one node
         if (m_mgFirstp != m_mgLastp) {
             UINFO(6, "MergeCond - First: " << m_mgFirstp << " Last: " << m_mgLastp << endl);
@@ -266,7 +649,7 @@ private:
             // and we also need to keep track of it for comparisons later.
             m_mgCondp = m_mgCondp->cloneTree(false);
             // Create equivalent 'if' statement and insert it before the first node
-            AstIf* const resultp = new AstIf(m_mgCondp->fileline(), m_mgCondp);
+            AstIf* const resultp = new AstIf{m_mgCondp->fileline(), m_mgCondp};
             m_mgFirstp->addHereThisAsNext(resultp);
             // Unzip the list and insert under branches
             AstNode* nextp = m_mgFirstp;
@@ -308,10 +691,12 @@ private:
                     VL_DO_DANGLING(ifp->deleteTree(), ifp);
                 }
             } while (nextp);
-            // Recursively merge the resulting AstIf
-            recursivep = resultp;
-        } else if (const AstNodeIf* const ifp = VN_CAST(m_mgFirstp, NodeIf)) {
-            // There was nothing to merge this AstNodeIf with, but try to merge it's branches
+            // Merge the branches of the resulting AstIf after re-analysis
+            if (resultp->ifsp()) m_workQueuep->push(resultp->ifsp());
+            if (resultp->elsesp()) m_workQueuep->push(resultp->elsesp());
+        } else if (AstNodeIf* const ifp = VN_CAST(m_mgFirstp, NodeIf)) {
+            // There was nothing to merge this AstNodeIf with, so try to merge its branches.
+            // No re-analysis is required for this, so do it directly below
             recursivep = ifp;
         }
         // Reset state
@@ -321,14 +706,13 @@ private:
         m_mgNextp = nullptr;
         AstNode::user1ClearTree();  // Clear marked variables
         AstNode::user2ClearTree();
-        // Merge recursively within the branches
+        // Merge recursively within the branches of an un-merged AstNodeIf
         if (recursivep) {
             iterateAndNextNull(recursivep->ifsp());
-            // Close list, if there is one at the end of the then branch
-            if (m_mgFirstp) mergeEnd(__LINE__);
             iterateAndNextNull(recursivep->elsesp());
-            // Close list, if there is one at the end of the else branch
-            if (m_mgFirstp) mergeEnd(__LINE__);
+            // Close a pending merge to ensure merge state is
+            // reset as expected at the end of this function
+            if (m_mgFirstp) mergeEnd();
         }
     }
 
@@ -351,47 +735,16 @@ private:
         return false;
     }
 
-    // Check if this node is cheap enough that duplicating it in two branches of an
-    // AstIf and is hence not likely to cause a performance degradation if doing so.
-    bool isCheapNode(AstNode* nodep) const {
-        if (VN_IS(nodep, Comment)) return true;
-        if (const AstNodeAssign* const assignp = VN_CAST(nodep, NodeAssign)) {
-            // Check LHS
-            AstNode* lhsp = assignp->lhsp();
-            while (AstWordSel* const wselp = VN_CAST(lhsp, WordSel)) {
-                // WordSel index is not constant, so might be expensive
-                if (!VN_IS(wselp->bitp(), Const)) return false;
-                lhsp = wselp->fromp();
-            }
-            // LHS is not a VarRef, so might be expensive
-            if (!VN_IS(lhsp, VarRef)) return false;
-
-            // Check RHS
-            AstNode* rhsp = assignp->rhsp();
-            while (AstWordSel* const wselp = VN_CAST(rhsp, WordSel)) {
-                // WordSel index is not constant, so might be expensive
-                if (!VN_IS(wselp->bitp(), Const)) return false;
-                rhsp = wselp->fromp();
-            }
-            // RHS is not a VarRef or Constant so might be expensive
-            if (!VN_IS(rhsp, VarRef) && !VN_IS(rhsp, Const)) return false;
-
-            // Otherwise it is a cheap assignment
-            return true;
-        }
-        return false;
-    }
-
-    bool addToList(AstNode* nodep, AstNode* condp, int line) {
+    bool addToList(AstNodeStmt* nodep, AstNode* condp) {
         // Set up head of new list if node is first in list
         if (!m_mgFirstp) {
-            UASSERT_OBJ(condp, nodep, "Cannot start new list without condition " << line);
+            UASSERT_OBJ(condp, nodep, "Cannot start new list without condition");
             // Mark variable references in the condition
             condp->foreach<AstVarRef>([](const AstVarRef* nodep) { nodep->varp()->user1(1); });
             // Now check again if mergeable. We need this to pick up assignments to conditions,
             // e.g.: 'c = c ? a : b' at the beginning of the list, which is in fact not mergeable
             // because it updates the condition. We simply bail on these.
-            if (m_checkMergeable(nodep) != Mergeable::YES) {
+            if ((*m_stmtPropertiesp)(nodep).writesConditionVar()) {
                 // Clear marked variables
                 AstNode::user1ClearTree();
                 // We did not add to the list
@@ -400,11 +753,13 @@ private:
             m_mgFirstp = nodep;
             m_mgCondp = condp;
             m_listLenght = 0;
-            // Add any preceding nodes to the list that would allow us to extend the merge range
-            for (;;) {
-                AstNode* const backp = m_mgFirstp->backp();
+            // Add any preceding nodes to the list that would allow us to extend the merge
+            // range
+            while (true) {
+                AstNodeStmt* const backp = VN_CAST(m_mgFirstp->backp(), NodeStmt);
                 if (!backp || backp->nextp() != m_mgFirstp) break;  // Don't move up the tree
-                if (m_checkMergeable(backp) != Mergeable::YES) break;
+                const StmtProperties& props = (*m_stmtPropertiesp)(backp);
+                if (props.m_isFence || props.writesConditionVar()) break;
                 if (isSimplifiableNode(backp)) {
                     ++m_listLenght;
                     m_mgFirstp = backp;
@@ -424,59 +779,53 @@ private:
         // Set up expected next node in list.
         m_mgNextp = nodep->nextp();
         // If last under parent, done with current list
-        if (!m_mgNextp) mergeEnd(__LINE__);
+        if (!m_mgNextp) mergeEnd();
         // We did add to the list
         return true;
     }
 
     // If this node is the next expected node and is helpful to add to the list, do so,
     // otherwise end the current merge. Return ture if added, false if ended merge.
-    bool addIfHelpfulElseEndMerge(AstNode* nodep) {
+    bool addIfHelpfulElseEndMerge(AstNodeStmt* nodep) {
         UASSERT_OBJ(m_mgFirstp, nodep, "List must be open");
         if (m_mgNextp == nodep) {
             if (isSimplifiableNode(nodep)) {
-                if (addToList(nodep, nullptr, __LINE__)) return true;
+                if (addToList(nodep, nullptr)) return true;
             } else if (isCheapNode(nodep)) {
                 nodep->user2(1);
-                if (addToList(nodep, nullptr, __LINE__)) return true;
+                if (addToList(nodep, nullptr)) return true;
             }
         }
         // Not added to list, so we are done with the current list
-        mergeEnd(__LINE__);
+        mergeEnd();
         return false;
     }
 
-    bool checkOrMakeMergeable(AstNode* nodep) {
-        const Mergeable reason = m_checkMergeable(nodep);
-        // If meregeable, we are done
-        if (reason == Mergeable::YES) return true;
-        // Node not mergeable.
-        // If no current list, then this node is just special, move on.
-        if (!m_mgFirstp) return false;
-        // Otherwise finish current list
-        mergeEnd(__LINE__);
-        // If a tree was not mergeable due to an assignment to a condition,
-        // then finishing the current list makes it mergeable again.
-        return reason == Mergeable::NO_COND_ASSIGN;
+    bool checkOrMakeMergeable(const AstNodeStmt* nodep) {
+        const StmtProperties& props = (*m_stmtPropertiesp)(nodep);
+        if (props.m_isFence) return false;  // Fence node never mergeable
+        // If the statement writes a condition variable of a pending merge,
+        // we must end the pending merge
+        if (m_mgFirstp && props.writesConditionVar()) mergeEnd();
+        return true;  // Now surely mergeable
     }
 
-    void mergeEndIfIncompatible(AstNode* nodep, AstNode* condp) {
+    void mergeEndIfIncompatible(const AstNode* nodep, const AstNode* condp) {
         if (m_mgFirstp && (m_mgNextp != nodep || !condp->sameTree(m_mgCondp))) {
             // Node in different list, or has different condition. Finish current list.
-            mergeEnd(__LINE__);
+            mergeEnd();
         }
     }
 
     // VISITORS
     virtual void visit(AstNodeAssign* nodep) override {
-        AstNode* const rhsp = nodep->rhsp();
-        if (const AstNodeCond* const condp = extractCond(rhsp)) {
+        if (AstNode* const condp = (*m_stmtPropertiesp)(nodep).m_condp) {
             // Check if mergeable
             if (!checkOrMakeMergeable(nodep)) return;
             // Close potentially incompatible pending merge
-            mergeEndIfIncompatible(nodep, condp->condp());
+            mergeEndIfIncompatible(nodep, condp);
             // Add current node
-            addToList(nodep, condp->condp(), __LINE__);
+            addToList(nodep, condp);
         } else if (m_mgFirstp) {
             addIfHelpfulElseEndMerge(nodep);
         }
@@ -493,21 +842,22 @@ private:
         // Close potentially incompatible pending merge
         mergeEndIfIncompatible(nodep, nodep->condp());
         // Add current node
-        addToList(nodep, nodep->condp(), __LINE__);
+        addToList(nodep, nodep->condp());
+    }
+
+    virtual void visit(AstNodeStmt* nodep) override {
+        if (m_mgFirstp && addIfHelpfulElseEndMerge(nodep)) return;
+        iterateChildren(nodep);
+    }
+
+    virtual void visit(AstCFunc* nodep) override {
+        // Merge function body
+        if (nodep->stmtsp()) process(nodep->stmtsp());
     }
 
     // For speed, only iterate what is necessary.
     virtual void visit(AstNetlist* nodep) override { iterateAndNextNull(nodep->modulesp()); }
     virtual void visit(AstNodeModule* nodep) override { iterateAndNextNull(nodep->stmtsp()); }
-    virtual void visit(AstCFunc* nodep) override {
-        iterateChildren(nodep);
-        // Close list, if there is one at the end of the function
-        if (m_mgFirstp) mergeEnd(__LINE__);
-    }
-    virtual void visit(AstNodeStmt* nodep) override {
-        if (m_mgFirstp && addIfHelpfulElseEndMerge(nodep)) return;
-        iterateChildren(nodep);
-    }
     virtual void visit(AstNode* nodep) override {}
 
 public:
@@ -520,6 +870,8 @@ public:
     }
 };
 
+}  // namespace
+
 //######################################################################
 // MergeConditionals class functions
 
diff --git a/src/V3OptionParser.cpp b/src/V3OptionParser.cpp
index 4439ba53d..d98b4fd90 100644
--- a/src/V3OptionParser.cpp
+++ b/src/V3OptionParser.cpp
@@ -30,6 +30,7 @@ struct V3OptionParser::Impl {
     // Setting for isOnOffAllowed() and isPartialMatchAllowed()
     enum class en : uint8_t {
         NONE,  // "-opt"
+        FONOFF,  // "-fopt" and "-fno-opt"
         ONOFF,  // "-opt" and "-no-opt"
         VALUE  // "-opt val"
     };
@@ -39,6 +40,7 @@ struct V3OptionParser::Impl {
         bool m_undocumented = false;  // This option is not documented
     public:
         virtual bool isValueNeeded() const override final { return MODE == en::VALUE; }
+        virtual bool isFOnOffAllowed() const override final { return MODE == en::FONOFF; }
         virtual bool isOnOffAllowed() const override final { return MODE == en::ONOFF; }
         virtual bool isPartialMatchAllowed() const override final { return ALLOW_PARTIAL_MATCH; }
         virtual bool isUndocumented() const override { return m_undocumented; }
@@ -47,6 +49,7 @@ struct V3OptionParser::Impl {
 
     // Actual action classes
     template <typename T> class ActionSet;  // "-opt" for bool-ish, "-opt val" for int and string
+    template <typename BOOL> class ActionFOnOff;  // "-fopt" and "-fno-opt" for bool-ish
     template <typename BOOL> class ActionOnOff;  // "-opt" and "-no-opt" for bool-ish
     class ActionCbCall;  // Callback without argument for "-opt"
     class ActionCbOnOff;  // Callback for "-opt" and "-no-opt"
@@ -80,6 +83,7 @@ V3OPTION_PARSER_DEF_ACT_CLASS(ActionSet, VOptionBool, m_valp->setTrueOrFalse(tru
 V3OPTION_PARSER_DEF_ACT_CLASS(ActionSet, int, *m_valp = std::atoi(argp), en::VALUE);
 V3OPTION_PARSER_DEF_ACT_CLASS(ActionSet, string, *m_valp = argp, en::VALUE);
 
+V3OPTION_PARSER_DEF_ACT_CLASS(ActionFOnOff, bool, *m_valp = !hasPrefixFNo(optp), en::FONOFF);
 V3OPTION_PARSER_DEF_ACT_CLASS(ActionOnOff, bool, *m_valp = !hasPrefixNo(optp), en::ONOFF);
 #ifndef V3OPTION_PARSER_NO_VOPTION_BOOL
 V3OPTION_PARSER_DEF_ACT_CLASS(ActionOnOff, VOptionBool, m_valp->setTrueOrFalse(!hasPrefixNo(optp)),
@@ -117,12 +121,23 @@ V3OPTION_PARSER_DEF_ACT_CB_CLASS(ActionCbPartialMatchVal, void(const char*, cons
 
 V3OptionParser::ActionIfs* V3OptionParser::find(const char* optp) {
     const auto it = m_pimpl->m_options.find(optp);
-    if (it != m_pimpl->m_options.end()) return it->second.get();
+    if (it != m_pimpl->m_options.end()) return it->second.get();  // Exact match
     for (auto&& act : m_pimpl->m_options) {
+        if (act.second->isFOnOffAllowed()) {  // Find starts with "-fno"
+            if (const char* const nop
+                = VString::startsWith(optp, "-fno-") ? (optp + strlen("-fno-")) : nullptr) {
+                if (act.first.substr(strlen("-f"), std::string::npos)
+                    == nop) {  // [-f]opt = [-fno-]opt
+                    return act.second.get();
+                }
+            }
+        }
         if (act.second->isOnOffAllowed()) {  // Find starts with "-no"
-            const char* const nop = VString::startsWith(optp, "-no") ? (optp + 3) : nullptr;
-            if (nop && (act.first == nop || act.first == (string{"-"} + nop))) {
-                return act.second.get();
+            if (const char* const nop
+                = VString::startsWith(optp, "-no") ? (optp + strlen("-no")) : nullptr) {
+                if (act.first == nop || act.first == (string{"-"} + nop)) {
+                    return act.second.get();
+                }
             }
         } else if (act.second->isPartialMatchAllowed()) {
             if (VString::startsWith(optp, act.first)) return act.second.get();
@@ -143,6 +158,12 @@ V3OptionParser::ActionIfs& V3OptionParser::add(const std::string& opt, ARG arg)
     return *insertedResult.first->second;
 }
 
+bool V3OptionParser::hasPrefixFNo(const char* strp) {  // True if strp begins "-fno" or "--fno"
+    UASSERT(strp[0] == '-', strp << " does not start with '-'");
+    if (strp[1] == '-') ++strp;  // Skip the first '-' of a "--" spelling
+    return VString::startsWith(strp, "-fno");  // NOTE(review): find() requires "-fno-"
+}
+
 bool V3OptionParser::hasPrefixNo(const char* strp) {
     UASSERT(strp[0] == '-', strp << " does not start with '-'");
     if (strp[1] == '-') ++strp;
@@ -178,6 +199,10 @@ void V3OptionParser::finalize() {
     for (auto&& opt : m_pimpl->m_options) {
         if (opt.second->isUndocumented()) continue;
         m_pimpl->m_spellCheck.pushCandidate(opt.first);
+        if (opt.second->isFOnOffAllowed()) {
+            m_pimpl->m_spellCheck.pushCandidate(
+                "-fno-" + opt.first.substr(strlen("-f"), std::string::npos));
+        }
         if (opt.second->isOnOffAllowed()) m_pimpl->m_spellCheck.pushCandidate("-no" + opt.first);
     }
     m_pimpl->m_isFinalized = true;
@@ -202,6 +227,7 @@ V3OPTION_PARSER_DEF_OP(Set, VOptionBool*, ActionSet<VOptionBool>)
 #endif
 V3OPTION_PARSER_DEF_OP(Set, int*, ActionSet<int>)
 V3OPTION_PARSER_DEF_OP(Set, string*, ActionSet<string>)
+V3OPTION_PARSER_DEF_OP(FOnOff, bool*, ActionFOnOff<bool>)
 V3OPTION_PARSER_DEF_OP(OnOff, bool*, ActionOnOff<bool>)
 #ifndef V3OPTION_PARSER_NO_VOPTION_BOOL
 V3OPTION_PARSER_DEF_OP(OnOff, VOptionBool*, ActionOnOff<VOptionBool>)
diff --git a/src/V3OptionParser.h b/src/V3OptionParser.h
index fc199264f..e77f43a26 100644
--- a/src/V3OptionParser.h
+++ b/src/V3OptionParser.h
@@ -66,6 +66,7 @@ private:
     // METHODS
     ActionIfs* find(const char* optp);
     template <class ACT, class ARG> ActionIfs& add(const string& opt, ARG arg);
+    static bool hasPrefixFNo(const char* strp);  // Returns true if strp starts with "-fno"
     static bool hasPrefixNo(const char* strp);  // Returns true if strp starts with "-no"
 
 public:
@@ -87,6 +88,7 @@ class V3OptionParser::ActionIfs VL_NOT_FINAL {
 public:
     virtual ~ActionIfs() = default;
     virtual bool isValueNeeded() const = 0;  // Need val of "-opt val"
+    virtual bool isFOnOffAllowed() const = 0;  // true if "-fno-opt" is allowed
     virtual bool isOnOffAllowed() const = 0;  // true if "-no-opt" is allowd
     virtual bool isPartialMatchAllowed() const = 0;  // true if "-Wno-" matches "-Wno-fatal"
     virtual bool isUndocumented() const = 0;  // Will not be suggested in typo
@@ -101,13 +103,15 @@ class V3OptionParser::AppendHelper final {
 public:
     // TYPES
     // Tag to specify which operator() to call
-    struct Set {};  // For ActionSet
+    struct FOnOff {};  // For ActionFOnOff
     struct OnOff {};  // For ActionOnOff
+    struct Set {};  // For ActionSet
+
     struct CbCall {};  // For ActionCbCall
-    struct CbOnOff {};  // For ActionOnOff
-    struct CbVal {};  // For ActionCbVal
+    struct CbOnOff {};  // For ActionOnOff of ActionFOnOff
     struct CbPartialMatch {};  // For ActionCbPartialMatch
     struct CbPartialMatchVal {};  // For ActionCbPartialMatchVal
+    struct CbVal {};  // For ActionCbVal
 
 private:
     // MEMBERS
@@ -122,6 +126,7 @@ public:
     ActionIfs& operator()(const char* optp, Set, int*) const;
     ActionIfs& operator()(const char* optp, Set, string*) const;
 
+    ActionIfs& operator()(const char* optp, FOnOff, bool*) const;
     ActionIfs& operator()(const char* optp, OnOff, bool*) const;
 #ifndef V3OPTION_PARSER_NO_VOPTION_BOOL
     ActionIfs& operator()(const char* optp, OnOff, VOptionBool*) const;
@@ -144,13 +149,14 @@ public:
 
 #define V3OPTION_PARSER_DECL_TAGS \
     const auto Set VL_ATTR_UNUSED = V3OptionParser::AppendHelper::Set{}; \
+    const auto FOnOff VL_ATTR_UNUSED = V3OptionParser::AppendHelper::FOnOff{}; \
     const auto OnOff VL_ATTR_UNUSED = V3OptionParser::AppendHelper::OnOff{}; \
     const auto CbCall VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbCall{}; \
     const auto CbOnOff VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbOnOff{}; \
-    const auto CbVal VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbVal{}; \
     const auto CbPartialMatch VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbPartialMatch{}; \
     const auto CbPartialMatchVal VL_ATTR_UNUSED \
-        = V3OptionParser::AppendHelper::CbPartialMatchVal {}
+        = V3OptionParser::AppendHelper::CbPartialMatchVal{}; \
+    const auto CbVal VL_ATTR_UNUSED = V3OptionParser::AppendHelper::CbVal{};
 
 //######################################################################
 
diff --git a/src/V3Options.cpp b/src/V3Options.cpp
index 88e1b4d31..b3abe29f4 100644
--- a/src/V3Options.cpp
+++ b/src/V3Options.cpp
@@ -775,8 +775,16 @@ void V3Options::notify() {
             && !v3Global.opt.xmlOnly());
     }
 
-    // --trace-threads implies --threads 1 unless explicitly specified
-    if (traceThreads() && !threads()) m_threads = 1;
+    if (trace()) {
+        // With --trace-fst, --trace-threads implies --threads 1 unless explicitly specified
+        if (traceFormat().fst() && traceThreads() && !threads()) m_threads = 1;
+
+        // With --trace, --trace-threads is ignored
+        if (traceFormat().vcd()) m_traceThreads = threads() ? 1 : 0;
+    }
+
+    UASSERT(!(useTraceParallel() && useTraceOffload()),
+            "Cannot use both parallel and offloaded tracing");
 
     // Default split limits if not specified
     if (m_outputSplitCFuncs < 0) m_outputSplitCFuncs = m_outputSplit;
@@ -1075,6 +1083,28 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
     });
     DECL_OPTION("-flatten", OnOff, &m_flatten);
 
+    DECL_OPTION("-facyc-simp", FOnOff, &m_fAcycSimp);
+    DECL_OPTION("-fassemble", FOnOff, &m_fAssemble);
+    DECL_OPTION("-fcase", FOnOff, &m_fCase);
+    DECL_OPTION("-fcombine", FOnOff, &m_fCombine);
+    DECL_OPTION("-fconst", FOnOff, &m_fConst);
+    DECL_OPTION("-fconst-bit-op-tree", FOnOff, &m_fConstBitOpTree);
+    DECL_OPTION("-fdedup", FOnOff, &m_fDedupe);
+    DECL_OPTION("-fexpand", FOnOff, &m_fExpand);
+    DECL_OPTION("-fgate", FOnOff, &m_fGate);
+    DECL_OPTION("-finline", FOnOff, &m_fInline);
+    DECL_OPTION("-flife", FOnOff, &m_fLife);
+    DECL_OPTION("-flife-post", FOnOff, &m_fLifePost);
+    DECL_OPTION("-flocalize", FOnOff, &m_fLocalize);
+    DECL_OPTION("-fmerge-cond", FOnOff, &m_fMergeCond);
+    DECL_OPTION("-fmerge-const-pool", FOnOff, &m_fMergeConstPool);
+    DECL_OPTION("-freloop", FOnOff, &m_fReloop);
+    DECL_OPTION("-freorder", FOnOff, &m_fReorder);
+    DECL_OPTION("-fsplit", FOnOff, &m_fSplit);
+    DECL_OPTION("-fsubst", FOnOff, &m_fSubst);
+    DECL_OPTION("-fsubst-const", FOnOff, &m_fSubstConst);
+    DECL_OPTION("-ftable", FOnOff, &m_fTable);
+
     DECL_OPTION("-G", CbPartialMatch, [this](const char* optp) { addParameter(optp, false); });
     DECL_OPTION("-gate-stmts", Set, &m_gateStmts);
     DECL_OPTION("-gdb", CbCall, []() {});  // Processed only in bin/verilator shell
@@ -1144,50 +1174,51 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
         }
     });
     DECL_OPTION("-max-num-width", Set, &m_maxNumWidth);
-    DECL_OPTION("-merge-const-pool", OnOff, &m_mergeConstPool);
     DECL_OPTION("-mod-prefix", Set, &m_modPrefix);
 
-    DECL_OPTION("-O", CbPartialMatch, [this](const char* optp) {
-        // Optimization
+    DECL_OPTION("-O0", CbCall, [this]() { optimize(0); });
+    DECL_OPTION("-O1", CbCall, [this]() { optimize(1); });
+    DECL_OPTION("-O2", CbCall, [this]() { optimize(2); });
+    DECL_OPTION("-O3", CbCall, [this]() { optimize(3); });
+
+    DECL_OPTION("-O", CbPartialMatch, [this, fl](const char* optp) {
+        // Optimization, e.g. -O1rX
+        // LCOV_EXCL_START
+        fl->v3warn(DEPRECATED, "Option -O<letter> is deprecated. "
+                               "Use -f<optimization> or -fno-<optimization> instead.");
         for (const char* cp = optp; *cp; ++cp) {
             const bool flag = isupper(*cp);
             switch (tolower(*cp)) {
-            case '0': optimize(0); break;  // 0=all off
-            case '1': optimize(1); break;  // 1=all on
-            case '2': optimize(2); break;  // 2=not used
-            case '3': optimize(3); break;  // 3=high
-            case 'a': m_oTable = flag; break;
-            case 'b': m_oCombine = flag; break;
-            case 'c': m_oConst = flag; break;
-            case 'd': m_oDedupe = flag; break;
-            case 'e': m_oCase = flag; break;
-            //    f
-            case 'g': m_oGate = flag; break;
-            //    h
-            case 'i': m_oInline = flag; break;
-            //    j
-            case 'k': m_oSubstConst = flag; break;
-            case 'l': m_oLife = flag; break;
-            case 'm': m_oAssemble = flag; break;
-            //    n
-            case 'o':
-                m_oConstBitOpTree = flag;
-                break;  // Can remove ~2022-01 when stable
-            //    o will be used as an escape for a second character of optimization disables
+            case '0': optimize(0); break;
+            case '1': optimize(1); break;
+            case '2': optimize(2); break;
+            case '3': optimize(3); break;
+            case 'a': m_fTable = flag; break;  // == -fno-table
+            case 'b': m_fCombine = flag; break;  // == -fno-combine
+            case 'c': m_fConst = flag; break;  // == -fno-const
+            case 'd': m_fDedupe = flag; break;  // == -fno-dedup
+            case 'e': m_fCase = flag; break;  // == -fno-case
+            case 'g': m_fGate = flag; break;  // == -fno-gate
+            case 'i': m_fInline = flag; break;  // == -fno-inline
+            case 'k': m_fSubstConst = flag; break;  // == -fno-subst-const
+            case 'l': m_fLife = flag; break;  // == -fno-life
+            case 'm': m_fAssemble = flag; break;  // == -fno-assemble
+            case 'o': m_fConstBitOpTree = flag; break;  // == -fno-const-bit-op-tree
             case 'p':
                 m_public = !flag;
                 break;  // With -Op so flag=0, we want public on so few optimizations done
-            //    q
-            case 'r': m_oReorder = flag; break;
-            case 's': m_oSplit = flag; break;
-            case 't': m_oLifePost = flag; break;
-            case 'u': m_oSubst = flag; break;
-            case 'v': m_oReloop = flag; break;
-            case 'w': m_oMergeCond = flag; break;
-            case 'x': m_oExpand = flag; break;
-            case 'y': m_oAcycSimp = flag; break;
-            case 'z': m_oLocalize = flag; break;
-            default: break;  // No error, just ignore
+            case 'r': m_fReorder = flag; break;  // == -fno-reorder
+            case 's': m_fSplit = flag; break;  // == -fno-split
+            case 't': m_fLifePost = flag; break;  // == -fno-life-post
+            case 'u': m_fSubst = flag; break;  // == -fno-subst
+            case 'v': m_fReloop = flag; break;  // == -fno-reloop
+            case 'w': m_fMergeCond = flag; break;  // == -fno-merge-cond
+            case 'x': m_fExpand = flag; break;  // == -fno-expand
+            case 'y': m_fAcycSimp = flag; break;  // == -fno-acyc-simp
+            case 'z': m_fLocalize = flag; break;  // == -fno-localize
+            default:
+                break;  // No error, just ignore
+                // LCOV_EXCL_STOP
             }
         }
     });
@@ -1352,7 +1383,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
     DECL_OPTION("-trace-threads", CbVal, [this, fl](const char* valp) {
         m_trace = true;
         m_traceThreads = std::atoi(valp);
-        if (m_traceThreads < 0) fl->v3fatal("--trace-threads must be >= 0: " << valp);
+        if (m_traceThreads < 1) fl->v3fatal("--trace-threads must be >= 1: " << valp);
     });
     DECL_OPTION("-trace-underscore", OnOff, &m_traceUnderscore);
 
@@ -1781,26 +1812,26 @@ int V3Options::dumpTreeLevel(const string& srcfile_path) {
 void V3Options::optimize(int level) {
     // Set all optimizations to on/off
     const bool flag = level > 0;
-    m_oAcycSimp = flag;
-    m_oAssemble = flag;
-    m_oCase = flag;
-    m_oCombine = flag;
-    m_oConst = flag;
-    m_oConstBitOpTree = flag;
-    m_oDedupe = flag;
-    m_oExpand = flag;
-    m_oGate = flag;
-    m_oInline = flag;
-    m_oLife = flag;
-    m_oLifePost = flag;
-    m_oLocalize = flag;
-    m_oMergeCond = flag;
-    m_oReloop = flag;
-    m_oReorder = flag;
-    m_oSplit = flag;
-    m_oSubst = flag;
-    m_oSubstConst = flag;
-    m_oTable = flag;
+    m_fAcycSimp = flag;
+    m_fAssemble = flag;
+    m_fCase = flag;
+    m_fCombine = flag;
+    m_fConst = flag;
+    m_fConstBitOpTree = flag;
+    m_fDedupe = flag;
+    m_fExpand = flag;
+    m_fGate = flag;
+    m_fInline = flag;
+    m_fLife = flag;
+    m_fLifePost = flag;
+    m_fLocalize = flag;
+    m_fMergeCond = flag;
+    m_fReloop = flag;
+    m_fReorder = flag;
+    m_fSplit = flag;
+    m_fSubst = flag;
+    m_fSubstConst = flag;
+    m_fTable = flag;
     // And set specific optimization levels
     if (level >= 3) {
         m_inlineMult = -1;  // Maximum inlining
diff --git a/src/V3Options.h b/src/V3Options.h
index dd71a1b3b..f5ed6df29 100644
--- a/src/V3Options.h
+++ b/src/V3Options.h
@@ -246,7 +246,6 @@ private:
     bool m_lintOnly = false;        // main switch: --lint-only
     bool m_gmake = false;           // main switch: --make gmake
     bool m_main = false;            // main swithc: --main
-    bool m_mergeConstPool = true;   // main switch: --merge-const-pool
     bool m_outFormatOk = false;     // main switch: --cc, --sc or --sp was specified
     bool m_pedantic = false;        // main switch: --Wpedantic
     bool m_pinsScUint = false;      // main switch: --pins-sc-uint
@@ -340,27 +339,27 @@ private:
     V3LangCode  m_defaultLanguage;      // main switch: --language
 
     // MEMBERS (optimizations)
-    //                          // main switch: -Op: --public
-    bool        m_oAcycSimp;    // main switch: -Oy: acyclic pre-optimizations
-    bool        m_oAssemble;    // main switch: -Om: assign assemble
-    bool        m_oCase;        // main switch: -Oe: case tree conversion
-    bool        m_oCombine;     // main switch: -Ob: common icode packing
-    bool        m_oConst;       // main switch: -Oc: constant folding
-    bool        m_oConstBitOpTree;  // main switch: -Oo: constant bit op tree
-    bool        m_oDedupe;      // main switch: -Od: logic deduplication
-    bool        m_oExpand;      // main switch: -Ox: expansion of C macros
-    bool        m_oGate;        // main switch: -Og: gate wire elimination
-    bool        m_oInline;      // main switch: -Oi: module inlining
-    bool        m_oLife;        // main switch: -Ol: variable lifetime
-    bool        m_oLifePost;    // main switch: -Ot: delayed assignment elimination
-    bool        m_oLocalize;    // main switch: -Oz: convert temps to local variables
-    bool        m_oMergeCond;   // main switch: -Ob: merge conditionals
-    bool        m_oReloop;      // main switch: -Ov: reform loops
-    bool        m_oReorder;     // main switch: -Or: reorder assignments in blocks
-    bool        m_oSplit;       // main switch: -Os: always assignment splitting
-    bool        m_oSubst;       // main switch: -Ou: substitute expression temp values
-    bool        m_oSubstConst;  // main switch: -Ok: final constant substitution
-    bool        m_oTable;       // main switch: -Oa: lookup table creation
+    bool m_fAcycSimp;    // main switch: -fno-acyc-simp: acyclic pre-optimizations
+    bool m_fAssemble;    // main switch: -fno-assemble: assign assemble
+    bool m_fCase;        // main switch: -fno-case: case tree conversion
+    bool m_fCombine;     // main switch: -fno-combine: common icode packing
+    bool m_fConst;       // main switch: -fno-const: constant folding
+    bool m_fConstBitOpTree;  // main switch: -fno-const-bit-op-tree: constant bit op tree
+    bool m_fDedupe;      // main switch: -fno-dedup: logic deduplication
+    bool m_fExpand;      // main switch: -fno-expand: expansion of C macros
+    bool m_fGate;        // main switch: -fno-gate: gate wire elimination
+    bool m_fInline;      // main switch: -fno-inline: module inlining
+    bool m_fLife;        // main switch: -fno-life: variable lifetime
+    bool m_fLifePost;    // main switch: -fno-life-post: delayed assignment elimination
+    bool m_fLocalize;    // main switch: -fno-localize: convert temps to local variables
+    bool m_fMergeCond;   // main switch: -fno-merge-cond: merge conditionals
+    bool m_fMergeConstPool = true;  // main switch: --fmerge-const-pool
+    bool m_fReloop;      // main switch: -fno-reloop: reform loops
+    bool m_fReorder;     // main switch: -fno-reorder: reorder assignments in blocks
+    bool m_fSplit;       // main switch: -fno-split: always assignment splitting
+    bool m_fSubst;       // main switch: -fno-subst: substitute expression temp values
+    bool m_fSubstConst;  // main switch: -fno-subst-const: final constant substitution
+    bool m_fTable;       // main switch: -fno-table: lookup table creation
     // clang-format on
 
     bool m_available = false;  // Set to true at the end of option parsing
@@ -458,7 +457,6 @@ public:
     bool traceStructs() const { return m_traceStructs; }
     bool traceUnderscore() const { return m_traceUnderscore; }
     bool main() const { return m_main; }
-    bool mergeConstPool() const { return m_mergeConstPool; }
     bool outFormatOk() const { return m_outFormatOk; }
     bool keepTempFiles() const { return (V3Error::debugDefault() != 0); }
     bool pedantic() const { return m_pedantic; }
@@ -516,8 +514,10 @@ public:
     int traceMaxArray() const { return m_traceMaxArray; }
     int traceMaxWidth() const { return m_traceMaxWidth; }
     int traceThreads() const { return m_traceThreads; }
-    bool useTraceOffloadThread() const {
-        return traceThreads() == 0 ? 0 : traceThreads() - traceFormat().fst();
+    bool useTraceOffload() const { return trace() && traceFormat().fst() && traceThreads() > 1; }
+    bool useTraceParallel() const { return trace() && traceFormat().vcd() && threads() > 1; }
+    unsigned vmTraceThreads() const {
+        return useTraceParallel() ? threads() : useTraceOffload() ? 1 : 0;
     }
     int unrollCount() const { return m_unrollCount; }
     int unrollStmts() const { return m_unrollStmts; }
@@ -571,26 +571,27 @@ public:
     bool isNoClocker(const string& signame) const;
 
     // ACCESSORS (optimization options)
-    bool oAcycSimp() const { return m_oAcycSimp; }
-    bool oAssemble() const { return m_oAssemble; }
-    bool oCase() const { return m_oCase; }
-    bool oCombine() const { return m_oCombine; }
-    bool oConst() const { return m_oConst; }
-    bool oConstBitOpTree() const { return m_oConstBitOpTree; }
-    bool oDedupe() const { return m_oDedupe; }
-    bool oExpand() const { return m_oExpand; }
-    bool oGate() const { return m_oGate; }
-    bool oInline() const { return m_oInline; }
-    bool oLife() const { return m_oLife; }
-    bool oLifePost() const { return m_oLifePost; }
-    bool oLocalize() const { return m_oLocalize; }
-    bool oMergeCond() const { return m_oMergeCond; }
-    bool oReloop() const { return m_oReloop; }
-    bool oReorder() const { return m_oReorder; }
-    bool oSplit() const { return m_oSplit; }
-    bool oSubst() const { return m_oSubst; }
-    bool oSubstConst() const { return m_oSubstConst; }
-    bool oTable() const { return m_oTable; }
+    bool fAcycSimp() const { return m_fAcycSimp; }
+    bool fAssemble() const { return m_fAssemble; }
+    bool fCase() const { return m_fCase; }
+    bool fCombine() const { return m_fCombine; }
+    bool fConst() const { return m_fConst; }
+    bool fConstBitOpTree() const { return m_fConstBitOpTree; }
+    bool fDedupe() const { return m_fDedupe; }
+    bool fExpand() const { return m_fExpand; }
+    bool fGate() const { return m_fGate; }
+    bool fInline() const { return m_fInline; }
+    bool fLife() const { return m_fLife; }
+    bool fLifePost() const { return m_fLifePost; }
+    bool fLocalize() const { return m_fLocalize; }
+    bool fMergeCond() const { return m_fMergeCond; }
+    bool fMergeConstPool() const { return m_fMergeConstPool; }
+    bool fReloop() const { return m_fReloop; }
+    bool fReorder() const { return m_fReorder; }
+    bool fSplit() const { return m_fSplit; }
+    bool fSubst() const { return m_fSubst; }
+    bool fSubstConst() const { return m_fSubstConst; }
+    bool fTable() const { return m_fTable; }
 
     string traceClassBase() const { return m_traceFormat.classBase(); }
     string traceClassLang() const { return m_traceFormat.classBase() + (systemC() ? "Sc" : "C"); }
diff --git a/src/V3Premit.cpp b/src/V3Premit.cpp
index 7501cd456..836b7c814 100644
--- a/src/V3Premit.cpp
+++ b/src/V3Premit.cpp
@@ -133,7 +133,7 @@ private:
                                   && !constp->num().isString();  // Not a string
         if (useConstPool) {
             // Extract into constant pool.
-            const bool merge = v3Global.opt.mergeConstPool();
+            const bool merge = v3Global.opt.fMergeConstPool();
             varp = v3Global.rootp()->constPoolp()->findConst(constp, merge)->varp();
             nodep->deleteTree();
             ++m_extractedToConstPool;
diff --git a/src/V3Trace.cpp b/src/V3Trace.cpp
index 61d009b6f..9fa1b099a 100644
--- a/src/V3Trace.cpp
+++ b/src/V3Trace.cpp
@@ -180,6 +180,10 @@ private:
     TraceActivityVertex* const m_alwaysVtxp;  // "Always trace" vertex
     bool m_finding = false;  // Pass one of algorithm?
 
+    // Trace parallelism. Only VCD tracing can be parallelized at this time.
+    const uint32_t m_parallelism
+        = v3Global.opt.useTraceParallel() ? static_cast<uint32_t>(v3Global.opt.threads()) : 1;
+
     VDouble0 m_statUniqSigs;  // Statistic tracking
     VDouble0 m_statUniqCodes;  // Statistic tracking
 
@@ -388,7 +392,7 @@ private:
                 if (!it->second->duplicatep()) {
                     uint32_t cost = 0;
                     const AstTraceDecl* const declp = it->second->nodep();
-                    // The number of comparisons required by tracep->chg*
+                    // The number of comparisons required by bufp->chg*
                     cost += declp->isWide() ? declp->codeInc() : 1;
                     // Arrays are traced by element
                     cost *= declp->arrayRange().ranged() ? declp->arrayRange().elements() : 1;
@@ -494,7 +498,7 @@ private:
         };
         if (isTopFunc) {
             // Top functions
-            funcp->argTypes("void* voidSelf, " + v3Global.opt.traceClassBase() + "* tracep");
+            funcp->argTypes("void* voidSelf, " + v3Global.opt.traceClassBase() + "::Buffer* bufp");
             addInitStr(voidSelfAssign(m_topModp));
             addInitStr(symClassAssign());
             // Add global activity check to change dump functions
@@ -508,32 +512,33 @@ private:
                 m_regFuncp->addStmtsp(new AstText(flp, "tracep->addChgCb(", true));
             }
             m_regFuncp->addStmtsp(new AstAddrOfCFunc(flp, funcp));
-            m_regFuncp->addStmtsp(new AstText(flp, ", vlSelf);\n", true));
+            const string threadPool{m_parallelism > 1 ? "vlSymsp->__Vm_threadPoolp" : "nullptr"};
+            m_regFuncp->addStmtsp(new AstText(flp, ", vlSelf, " + threadPool + ");\n", true));
         } else {
             // Sub functions
-            funcp->argTypes(v3Global.opt.traceClassBase() + "* tracep");
+            funcp->argTypes(v3Global.opt.traceClassBase() + "::Buffer* bufp");
             // Setup base references. Note in rare occasions we can end up with an empty trace
             // sub function, hence the VL_ATTR_UNUSED attributes.
             if (full) {
                 // Full dump sub function
                 addInitStr("uint32_t* const oldp VL_ATTR_UNUSED = "
-                           "tracep->oldp(vlSymsp->__Vm_baseCode);\n");
+                           "bufp->oldp(vlSymsp->__Vm_baseCode);\n");
             } else {
                 // Change dump sub function
-                if (v3Global.opt.useTraceOffloadThread()) {
+                if (v3Global.opt.useTraceOffload()) {
                     addInitStr("const uint32_t base VL_ATTR_UNUSED = "
                                "vlSymsp->__Vm_baseCode + "
                                + cvtToStr(baseCode) + ";\n");
-                    addInitStr("if (false && tracep) {}  // Prevent unused\n");
+                    addInitStr("if (false && bufp) {}  // Prevent unused\n");
                 } else {
                     addInitStr("uint32_t* const oldp VL_ATTR_UNUSED = "
-                               "tracep->oldp(vlSymsp->__Vm_baseCode + "
+                               "bufp->oldp(vlSymsp->__Vm_baseCode + "
                                + cvtToStr(baseCode) + ");\n");
                 }
             }
             // Add call to top function
             AstCCall* const callp = new AstCCall(funcp->fileline(), funcp);
-            callp->argTypes("tracep");
+            callp->argTypes("bufp");
             topFuncp->addStmtsp(callp);
         }
         // Done
@@ -728,7 +733,7 @@ private:
         // We will split functions such that each have to dump roughly the same amount of data
         // for this we need to keep tack of the number of codes used by the trace functions.
         uint32_t nFullCodes = 0;  // Number of non-duplicate codes (need to go into full* dump)
-        uint32_t nChgCodes = 0;  // Number of non-consant codes (need to go in to chg* dump)
+        uint32_t nChgCodes = 0;  // Number of non-constant codes (need to go into chg* dump)
         sortTraces(traces, nFullCodes, nChgCodes);
 
         UINFO(5, "nFullCodes: " << nFullCodes << " nChgCodes: " << nChgCodes << endl);
@@ -747,13 +752,11 @@ private:
         m_regFuncp->isLoose(true);
         m_topScopep->addActivep(m_regFuncp);
 
-        const int parallelism = 1;  // Note: will bump this later, code below works for any value
-
         // Create the full dump functions, also allocates signal numbers
-        createFullTraceFunction(traces, nFullCodes, parallelism);
+        createFullTraceFunction(traces, nFullCodes, m_parallelism);
 
         // Create the incremental dump functions
-        createChgTraceFunctions(traces, nChgCodes, parallelism);
+        createChgTraceFunctions(traces, nChgCodes, m_parallelism);
 
         // Remove refs to traced values from TraceDecl nodes, these have now moved under
         // TraceInc
diff --git a/src/V3Width.cpp b/src/V3Width.cpp
index 6a76170a1..c35b4270c 100644
--- a/src/V3Width.cpp
+++ b/src/V3Width.cpp
@@ -504,6 +504,7 @@ private:
         //   width: LHS + RHS
         AstNodeDType* const vdtypep = m_vup->dtypeNullSkipRefp();
         userIterate(vdtypep, WidthVP(SELF, BOTH).p());
+        // Conversions
         if (VN_IS(vdtypep, QueueDType)) {
             // Queue "element 0" is lhsp, so we need to swap arguments
             auto* const newp = new AstConsQueue(nodep->fileline(), nodep->rhsp()->unlinkFrBack(),
@@ -521,6 +522,16 @@ private:
             userIterateChildren(newp, m_vup);
             return;
         }
+        if (VN_IS(vdtypep, UnpackArrayDType)) {
+            auto* const newp = new AstPattern{nodep->fileline(), nullptr};
+            patConcatConvertRecurse(newp, nodep);
+            nodep->replaceWith(newp);
+            VL_DO_DANGLING(pushDeletep(nodep), nodep);
+            userIterate(newp, m_vup);
+            return;
+        }
+
+        // Concat handling
         if (m_vup->prelim()) {
             if (VN_IS(vdtypep, AssocArrayDType)  //
                 || VN_IS(vdtypep, DynArrayDType)  //
@@ -662,7 +673,8 @@ private:
             }
 
             AstNodeDType* const vdtypep = m_vup->dtypeNullSkipRefp();
-            if (VN_IS(vdtypep, QueueDType) || VN_IS(vdtypep, DynArrayDType)) {
+            if (VN_IS(vdtypep, QueueDType) || VN_IS(vdtypep, DynArrayDType)
+                || VN_IS(vdtypep, UnpackArrayDType)) {
                 if (times != 1)
                     nodep->v3warn(E_UNSUPPORTED, "Unsupported: Non-1 replication to form "
                                                      << vdtypep->prettyDTypeNameQ()
@@ -674,7 +686,7 @@ private:
                 VL_DO_DANGLING(pushDeletep(nodep), nodep);
                 return;
             }
-            if (VN_IS(vdtypep, AssocArrayDType) || VN_IS(vdtypep, UnpackArrayDType)) {
+            if (VN_IS(vdtypep, AssocArrayDType)) {
                 nodep->v3warn(E_UNSUPPORTED, "Unsupported: Replication to form "
                                                  << vdtypep->prettyDTypeNameQ() << " data type");
             }
@@ -6236,6 +6248,21 @@ private:
         return patmap;
     }
 
+    void patConcatConvertRecurse(AstPattern* patternp, AstConcat* nodep) {
+        if (AstConcat* lhsp = VN_CAST(nodep->lhsp(), Concat)) {  // Left is a nested concat
+            patConcatConvertRecurse(patternp, lhsp);  // Flatten it into patternp first
+        } else {  // Leaf: unlink from nodep, append as a pattern member
+            patternp->addItemsp(new AstPatMember{nodep->lhsp()->fileline(),
+                                                 nodep->lhsp()->unlinkFrBack(), nullptr, nullptr});
+        }
+        if (AstConcat* rhsp = VN_CAST(nodep->rhsp(), Concat)) {  // Then the right operand
+            patConcatConvertRecurse(patternp, rhsp);  // Items stay in left-to-right order
+        } else {  // Leaf: unlink from nodep, append as a pattern member
+            patternp->addItemsp(new AstPatMember{nodep->rhsp()->fileline(),
+                                                 nodep->rhsp()->unlinkFrBack(), nullptr, nullptr});
+        }
+    }
+
     void makeOpenArrayShell(AstNodeFTaskRef* nodep) {
         UINFO(4, "Replicate openarray function " << nodep->taskp() << endl);
         AstNodeFTask* const oldTaskp = nodep->taskp();
diff --git a/src/Verilator.cpp b/src/Verilator.cpp
index cabbb37b2..2d37511f1 100644
--- a/src/Verilator.cpp
+++ b/src/Verilator.cpp
@@ -237,7 +237,7 @@ static void process() {
         // Module inlining
         // Cannot remove dead variables after this, as alias information for final
         // V3Scope's V3LinkDot is in the AstVar.
-        if (v3Global.opt.oInline()) {
+        if (v3Global.opt.fInline()) {
             V3Inline::inlineAll(v3Global.rootp());
             V3LinkDot::linkDotArrayed(v3Global.rootp());  // Cleanup as made new modules
         }
@@ -308,11 +308,11 @@ static void process() {
         // Push constants across variables and remove redundant assignments
         V3Const::constifyAll(v3Global.rootp());
 
-        if (v3Global.opt.oLife()) V3Life::lifeAll(v3Global.rootp());
+        if (v3Global.opt.fLife()) V3Life::lifeAll(v3Global.rootp());
 
         // Make large low-fanin logic blocks into lookup tables
         // This should probably be done much later, once we have common logic elimination.
-        if (!v3Global.opt.lintOnly() && v3Global.opt.oTable()) {
+        if (!v3Global.opt.lintOnly() && v3Global.opt.fTable()) {
             V3Table::tableAll(v3Global.rootp());
         }
 
@@ -326,7 +326,7 @@ static void process() {
         V3Active::activeAll(v3Global.rootp());
 
         // Split single ALWAYS blocks into multiple blocks for better ordering chances
-        if (v3Global.opt.oSplit()) V3Split::splitAlwaysAll(v3Global.rootp());
+        if (v3Global.opt.fSplit()) V3Split::splitAlwaysAll(v3Global.rootp());
         V3SplitAs::splitAsAll(v3Global.rootp());
 
         // Create tracing sample points, before we start eliminating signals
@@ -338,11 +338,11 @@ static void process() {
 
         // Gate-based logic elimination; eliminate signals and push constant across cell boundaries
         // Instant propagation makes lots-o-constant reduction possibilities.
-        if (v3Global.opt.oGate()) {
+        if (v3Global.opt.fGate()) {
             V3Gate::gateAll(v3Global.rootp());
             // V3Gate calls constant propagation itself.
         } else {
-            v3info("Command Line disabled gate optimization with -Og/-O0.  "
+            v3info("Command Line disabled gate optimization with -fno-gate.  "
                    "This may cause ordering problems.");
         }
 
@@ -361,7 +361,7 @@ static void process() {
         }
 
         // Reorder assignments in pipelined blocks
-        if (v3Global.opt.oReorder()) V3Split::splitReorderAll(v3Global.rootp());
+        if (v3Global.opt.fReorder()) V3Split::splitReorderAll(v3Global.rootp());
 
         // Create delayed assignments
         // This creates lots of duplicate ACTIVES so ActiveTop needs to be after this step
@@ -383,12 +383,12 @@ static void process() {
         // Cleanup any dly vars or other temps that are simple assignments
         // Life must be done before Subst, as it assumes each CFunc under
         // _eval is called only once.
-        if (v3Global.opt.oLife()) {
+        if (v3Global.opt.fLife()) {
             V3Const::constifyAll(v3Global.rootp());
             V3Life::lifeAll(v3Global.rootp());
         }
 
-        if (v3Global.opt.oLifePost()) V3LifePost::lifepostAll(v3Global.rootp());
+        if (v3Global.opt.fLifePost()) V3LifePost::lifepostAll(v3Global.rootp());
 
         // Remove unused vars
         V3Const::constifyAll(v3Global.rootp());
@@ -415,13 +415,13 @@ static void process() {
         v3Global.assertScoped(false);
 
         // Move variables from modules to function local variables where possible
-        if (v3Global.opt.oLocalize()) V3Localize::localizeAll(v3Global.rootp());
+        if (v3Global.opt.fLocalize()) V3Localize::localizeAll(v3Global.rootp());
 
         // Remove remaining scopes; make varrefs/funccalls relative to current module
         V3Descope::descopeAll(v3Global.rootp());
 
         // Icache packing; combine common code in each module's functions into subroutines
-        if (v3Global.opt.oCombine()) V3Combine::combineAll(v3Global.rootp());
+        if (v3Global.opt.fCombine()) V3Combine::combineAll(v3Global.rootp());
     }
 
     V3Error::abortIfErrors();
@@ -445,30 +445,30 @@ static void process() {
     }
 
     // Expand macros and wide operators into C++ primitives
-    if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly() && v3Global.opt.oExpand()) {
+    if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly() && v3Global.opt.fExpand()) {
         V3Expand::expandAll(v3Global.rootp());
     }
 
     // Propagate constants across WORDSEL arrayed temporaries
-    if (!v3Global.opt.xmlOnly() && v3Global.opt.oSubst()) {
+    if (!v3Global.opt.xmlOnly() && v3Global.opt.fSubst()) {
         // Constant folding of expanded stuff
         V3Const::constifyCpp(v3Global.rootp());
         V3Subst::substituteAll(v3Global.rootp());
     }
 
-    if (!v3Global.opt.xmlOnly() && v3Global.opt.oSubstConst()) {
+    if (!v3Global.opt.xmlOnly() && v3Global.opt.fSubstConst()) {
         // Constant folding of substitutions
         V3Const::constifyCpp(v3Global.rootp());
         V3Dead::deadifyAll(v3Global.rootp());
     }
 
     if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly()) {
-        if (v3Global.opt.oMergeCond()) {
+        if (v3Global.opt.fMergeCond()) {
             // Merge conditionals
             V3MergeCond::mergeAll(v3Global.rootp());
         }
 
-        if (v3Global.opt.oReloop()) {
+        if (v3Global.opt.fReloop()) {
             // Reform loops to reduce code size
             // Must be after all Sel/array index based optimizations
             V3Reloop::reloopAll(v3Global.rootp());
diff --git a/test_regress/driver.pl b/test_regress/driver.pl
index ffcfac4a8..541fb296f 100755
--- a/test_regress/driver.pl
+++ b/test_regress/driver.pl
@@ -77,7 +77,6 @@ my $opt_gdbbt;
 my $opt_gdbsim;
 my $opt_hashset;
 my $opt_jobs = 1;
-my $opt_optimize;
 my $opt_quiet;
 my $opt_rerun;
 my $opt_rrsim;
@@ -104,7 +103,6 @@ if (! GetOptions(
           "hashset=s"   => \$opt_hashset,
           "help"        => \&usage,
           "j=i"         => \$opt_jobs,
-          "optimize:s"  => \$opt_optimize,
           "quiet!"      => \$opt_quiet,
           "rerun!"      => \$opt_rerun,
           "rr!"         => \$opt_rr,
@@ -661,7 +659,7 @@ sub new {
         verilator_define => 'VERILATOR',
         verilator_flags => ["-cc",
                             "-Mdir $self->{obj_dir}",
-                            "-OD",  # As currently disabled unless -O3
+                            "--fdedup",  # As currently disabled unless -O3
                             "--debug-check",
                             "--comp-limit-members 10", ],
         verilator_flags2 => [],
@@ -924,7 +922,6 @@ sub compile_vlt_flags {
     unshift @verilator_flags, "--trace" if $opt_trace;
     my $threads = ::calc_threads($Vltmt_threads);
     unshift @verilator_flags, "--threads $threads" if $param{vltmt} && $checkflags !~ /-threads /;
-    unshift @verilator_flags, "--trace-threads 1" if $param{vltmt} && $checkflags =~ /-trace /;
     unshift @verilator_flags, "--trace-threads 2" if $param{vltmt} && $checkflags =~ /-trace-fst /;
     unshift @verilator_flags, "--debug-partition" if $param{vltmt};
     unshift @verilator_flags, "-CFLAGS -ggdb -LDFLAGS -ggdb" if $opt_gdbsim;
@@ -935,19 +932,6 @@ sub compile_vlt_flags {
         $param{make_main} && $param{verilator_make_gmake};
     unshift @verilator_flags, "../" . $self->{main_filename} if
         $param{make_main} && $param{verilator_make_gmake};
-    if (defined $opt_optimize) {
-        my $letters = "";
-        if ($opt_optimize =~ /[a-zA-Z]/) {
-            $letters = $opt_optimize;
-        } else {  # Randomly turn on/off different optimizations
-            foreach my $l ('a' .. 'z') {
-                $letters .= ((rand() > 0.5) ? $l : uc $l);
-            }
-            unshift @verilator_flags, "--trace" if rand() > 0.5;
-            unshift @verilator_flags, "--coverage" if rand() > 0.5;
-        }
-        unshift @verilator_flags, "--O" . $letters;
-    }
 
     my @cmdargs = (
                    "--prefix " . $param{VM_PREFIX},
@@ -2907,11 +2891,6 @@ Displays this message and program version and exits.
 Run number of parallel tests, or 0 to determine the count based on the
 number of cores installed.  Requires Perl's Parallel::Forker package.
 
-=item --optimize
-
-Randomly turn on/off different optimizations.  With specific flags,
-use those optimization settings
-
 =item --quiet
 
 Suppress all output except for failures and progress messages every 15
diff --git a/test_regress/t/t_altera_lpm_mult_noinl.pl b/test_regress/t/t_altera_lpm_mult_noinl.pl
index 2eac39a3a..63f8aa315 100755
--- a/test_regress/t/t_altera_lpm_mult_noinl.pl
+++ b/test_regress/t/t_altera_lpm_mult_noinl.pl
@@ -15,7 +15,7 @@ top_filename("t/t_altera_lpm.v");
 $module =~ s/_noinl//;
 
 compile(
-    verilator_flags2 => ["--top-module ${module}", "-Oi"]
+    verilator_flags2 => ["--top-module ${module}", "-fno-inline"]
     );
 
 ok(1);
diff --git a/test_regress/t/t_alw_noreorder.pl b/test_regress/t/t_alw_noreorder.pl
index 46d021e6b..edc2a6f7b 100755
--- a/test_regress/t/t_alw_noreorder.pl
+++ b/test_regress/t/t_alw_noreorder.pl
@@ -12,7 +12,7 @@ scenarios(vlt_all => 1);
 
 top_filename("t/t_alw_reorder.v");
 compile(
-    verilator_flags2 => ["--stats -Or"],
+    verilator_flags2 => ["--stats -fno-reorder"],
     );
 
 file_grep($Self->{stats}, qr/Optimizations, Split always\s+(\d+)/i, 0);
diff --git a/test_regress/t/t_assign_inline.pl b/test_regress/t/t_assign_inline.pl
index 27414cae0..1683d1777 100755
--- a/test_regress/t/t_assign_inline.pl
+++ b/test_regress/t/t_assign_inline.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(simulator => 1);
 
 compile(
-    verilator_flags2 => ["-O0 -OG"],
+    verilator_flags2 => ["-O0 -fgate"],
     );
 
 execute(
diff --git a/test_regress/t/t_assign_slice_overflow_ox.pl b/test_regress/t/t_assign_slice_overflow_ox.pl
index 5251be495..8702b94fe 100755
--- a/test_regress/t/t_assign_slice_overflow_ox.pl
+++ b/test_regress/t/t_assign_slice_overflow_ox.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t_assign_slice_overflow.v");
 
 compile(
-    verilator_flags2 => ["-Ox"],
+    verilator_flags2 => ["-fno-expand"],
     );
 
 execute(
diff --git a/test_regress/t/t_case_66bits_noexpand.pl b/test_regress/t/t_case_66bits_noexpand.pl
index fae2f640f..738da6174 100755
--- a/test_regress/t/t_case_66bits_noexpand.pl
+++ b/test_regress/t/t_case_66bits_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_case_66bits.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_case_incrdecr.pl b/test_regress/t/t_case_incrdecr.pl
index abbcf936a..729c0cc8a 100755
--- a/test_regress/t/t_case_incrdecr.pl
+++ b/test_regress/t/t_case_incrdecr.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(simulator => 1);
 
 compile(
-    verilator_flags2 => ["--trace --Os -x-assign 0"],
+    verilator_flags2 => ["--trace --fno-split -x-assign 0"],
     );
 
 execute(
diff --git a/test_regress/t/t_case_write1.pl b/test_regress/t/t_case_write1.pl
index 4fa36576d..33e2bb517 100755
--- a/test_regress/t/t_case_write1.pl
+++ b/test_regress/t/t_case_write1.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(simulator => 1);
 
 compile(
-    verilator_flags2 => ["--stats --O3 -x-assign fast"],
+    verilator_flags2 => ["--stats -O3 -x-assign fast"],
     );
 
 execute(
diff --git a/test_regress/t/t_case_write1_noexpand.pl b/test_regress/t/t_case_write1_noexpand.pl
index cadb667e6..48c57c39a 100755
--- a/test_regress/t/t_case_write1_noexpand.pl
+++ b/test_regress/t/t_case_write1_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_case_write1.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_case_write2.pl b/test_regress/t/t_case_write2.pl
index 4fa36576d..33e2bb517 100755
--- a/test_regress/t/t_case_write2.pl
+++ b/test_regress/t/t_case_write2.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(simulator => 1);
 
 compile(
-    verilator_flags2 => ["--stats --O3 -x-assign fast"],
+    verilator_flags2 => ["--stats -O3 -x-assign fast"],
     );
 
 execute(
diff --git a/test_regress/t/t_trace_c_api.pl b/test_regress/t/t_concat_unpack.pl
similarity index 52%
rename from test_regress/t/t_trace_c_api.pl
rename to test_regress/t/t_concat_unpack.pl
index 541970008..1aa73f80a 100755
--- a/test_regress/t/t_trace_c_api.pl
+++ b/test_regress/t/t_concat_unpack.pl
@@ -2,29 +2,20 @@
 if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
 # DESCRIPTION: Verilator: Verilog Test driver/expect definition
 #
-# Copyright 2003-2013 by Wilson Snyder. This program is free software; you
+# Copyright 2022 by Wilson Snyder. This program is free software; you
 # can redistribute it and/or modify it under the terms of either the GNU
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
 # SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
 
-scenarios(vlt => 1);
+scenarios(simulator => 1);
 
 compile(
-    make_top_shell => 0,
-    make_main => 0,
-    v_flags2 => ["--trace --exe $Self->{t_dir}/t_trace_c_api.cpp",
-                 "-CFLAGS -DVERILATED_VCD_TEST",
-                 "-CFLAGS -DVL_TRACE_VCD_OLD_API"],
     );
 
 execute(
     check_finished => 1,
     );
 
-# vcddiff bug crashes
-#vcd_identical("$Self->{obj_dir}/simx.vcd",
-#              $Self->{golden_filename});
-
 ok(1);
 1;
diff --git a/test_regress/t/t_concat_unpack.v b/test_regress/t/t_concat_unpack.v
new file mode 100755
index 000000000..8d3f4bac2
--- /dev/null
+++ b/test_regress/t/t_concat_unpack.v
@@ -0,0 +1,36 @@
+// DESCRIPTION: Verilator: Verilog Test module
+//
+// This file ONLY is placed under the Creative Commons Public Domain, for
+// any use, without warranty, 2022 by Wilson Snyder.
+// SPDX-License-Identifier: CC0-1.0
+
+module t(/*AUTOARG*/
+   // Inputs
+   clk
+   );
+   input clk;
+
+   wire [31:0] arr [0:7];
+   assign arr[0:7] = {
+                      {16'hffff, 16'h0000},
+                      {16'h0000, 16'h0000},
+                      {16'h0a0a, 16'h0000},
+                      {16'ha0a0, 16'h0000},
+                      {16'hffff, 16'h0000},
+                      {16'h0000, 16'h0000},
+                      {16'h0a0a, 16'h0000},
+                      {16'ha0a0, 16'h0000}
+                      };
+
+   int cyc = 0;
+
+   always @(posedge clk) begin
+      cyc <= cyc + 1;
+      if (cyc == 9) begin
+         if (arr[0] !== 32'hffff0000) $stop;
+         if (arr[7] !== 32'ha0a00000) $stop;
+         $write("*-* All Finished *-*\n");
+         $finish;
+      end
+   end
+endmodule
diff --git a/test_regress/t/t_const_no_opt.pl b/test_regress/t/t_const_no_opt.pl
index 33be39810..79bc15076 100755
--- a/test_regress/t/t_const_no_opt.pl
+++ b/test_regress/t/t_const_no_opt.pl
@@ -13,7 +13,7 @@ top_filename("t/t_const_opt.v");
 
 # Run the same design as t_const_opt.pl without bitopt tree optimization to make sure that the result is same.
 compile(
-    verilator_flags2 => ["-Wno-UNOPTTHREADS", "--stats", "-Oo", "$Self->{t_dir}/t_const_opt.cpp"],
+    verilator_flags2 => ["-Wno-UNOPTTHREADS", "--stats", "-fno-const-bit-op-tree", "$Self->{t_dir}/t_const_opt.cpp"],
     );
 
 execute(
diff --git a/test_regress/t/t_const_opt.pl b/test_regress/t/t_const_opt.pl
index 26143eb57..83e301744 100755
--- a/test_regress/t/t_const_opt.pl
+++ b/test_regress/t/t_const_opt.pl
@@ -18,5 +18,8 @@ execute(
     check_finished => 1,
     );
 
+if ($Self->{vlt}) {
+    file_grep($Self->{stats}, qr/Optimizations, Const bit op reduction\s+(\d+)/i, 11);
+}
 ok(1);
 1;
diff --git a/test_regress/t/t_const_opt.v b/test_regress/t/t_const_opt.v
index be1e49c03..407fef13c 100644
--- a/test_regress/t/t_const_opt.v
+++ b/test_regress/t/t_const_opt.v
@@ -4,6 +4,11 @@
 // any use, without warranty, 2021 Yutetsu TAKATSUKASA.
 // SPDX-License-Identifier: CC0-1.0
 
+// This function always returns 0, so it is safe to bitwise OR it with any value.
+// Calling this function stops constant folding, as Verilator does not know
+// what this function returns.
+import "DPI-C" context function int fake_dependency();
+
 module t(/*AUTOARG*/
    // Inputs
    clk
@@ -57,7 +62,8 @@ module t(/*AUTOARG*/
          $write("[%0t] cyc==%0d crc=%x sum=%x\n", $time, cyc, crc, sum);
          if (crc !== 64'hc77bb9b3784ea091) $stop;
          // What checksum will we end up with (above print should match)
-`define EXPECTED_SUM 64'hcae926ece668f35d
+`define EXPECTED_SUM 64'hdccb9e7b8b638233
+
          if (sum !== `EXPECTED_SUM) $stop;
          $write("*-* All Finished *-*\n");
          $finish;
@@ -79,10 +85,11 @@ module Test(/*AUTOARG*/
    logic d0, d1, d2, d3, d4, d5, d6, d7;
    logic bug3182_out;
    logic bug3197_out;
+   logic bug3445_out;
 
    output logic o;
 
-   logic [6:0] tmp;
+   logic [7:0] tmp;
    assign o = ^tmp;
 
    always_ff @(posedge clk) begin
@@ -105,10 +112,12 @@ module Test(/*AUTOARG*/
       tmp[4] <= i[0] & (i[1] & (i[2] & (i[3] | d[4])));  // ConstBitOpTreeVisitor::m_frozenNodes
       tmp[5] <= bug3182_out;
       tmp[6] <= bug3197_out;
+      tmp[7] <= bug3445_out;
    end
 
    bug3182 i_bug3182(.in(d[4:0]), .out(bug3182_out));
    bug3197 i_bug3197(.clk(clk), .in(d), .out(bug3197_out));
+   bug3445 i_bug3445(.clk(clk), .in(d), .out(bug3445_out));
 
 endmodule
 
@@ -116,11 +125,6 @@ module bug3182(in, out);
    input wire [4:0] in;
    output wire out;
 
-   // This function always returns 0, so safe to take bitwise OR with any value.
-   // Calling this function stops constant folding as Verialtor does not know
-   // what this function returns.
-   import "DPI-C" context function int fake_dependency();
-
    logic [4:0] bit_source;
 
    /* verilator lint_off WIDTH */
@@ -140,3 +144,62 @@ module bug3197(input wire clk, input wire [31:0] in, output out);
    wire tmp0 = (|d[38:0]);
    assign out = (d[39] | tmp0);
 endmodule
+
+
+// Bug #3445
+// An unoptimized node is kept as a frozen node, but its LSB and polarity were not saved.
+// AST of RHS of result0 looks as below:
+//   AND(SHIFTR(AND(WORDSEL(ARRAYSEL(VARREF)), WORDSEL(ARRAYSEL(VARREF)))), 32'd11)
+//                  ~~~~~~~~~~~~~~~~~~~~~~~~~  ~~~~~~~~~~~~~~~~~~~~~~~~~
+// The two WORDSELs are frozen nodes. They are under a SHIFTR of 11 bits.
+//
+// Fixing #3445 needs to
+//  1. Take AstShiftR and AstNot into the op count when deciding whether it is optimizable
+//     (result0 and result2 in the test)
+//  2. Insert AstShiftR if LSB of the frozen node is not 0 (result1 in the test)
+//  3. Insert AstNot if polarity of the frozen node is false (result3 in the
+//     test)
+module bug3445(input wire clk, input wire [31:0] in, output wire out);
+   logic [127:0] d;
+   always_ff @(posedge clk)
+      d <= {d[95:0], in};
+
+   typedef struct packed {
+      logic        a;
+      logic [ 2:0] b;
+      logic [ 2:0] c;
+      logic [ 1:0] d;
+      logic [ 7:0] e;
+      logic [31:0] f;
+      logic [ 3:0] g;
+      logic [31:0] h;
+      logic        i;
+      logic [41:0] j;
+   } packed_struct;
+   packed_struct st[4];
+
+   // This is always 1'b0, but Verilator cannot notice it.
+   // This signal helps to reveal wrong optimization of result2 and result3.
+   logic zero;
+   always_ff @(posedge clk) begin
+      st[0] <= d;
+      st[1] <= st[0];
+      st[2] <= st[1];
+      st[3] <= st[2];
+      zero <= fake_dependency() > 0;
+   end
+
+   logic result0, result1, result2, result3;
+   always_ff @(posedge clk) begin
+      // Cannot optimize further.
+      result0 <= (st[0].g[0] & st[0].h[0]) & (in[0] == 1'b0);
+      // There are redundant !in[0] terms. They should be simplified.
+      result1 <= (!in[0] & (st[1].g[0] & st[1].h[0])) & ((in[0] == 1'b0) & !in[0]);
+      // Cannot optimize further.
+      result2 <= !(st[2].g[0] & st[2].h[0]) & (zero == 1'b0);
+      // There are redundant zero terms. They should be simplified.
+      result3 <= (!zero & !(st[3].g[0] & st[3].h[0])) & ((zero == 1'b0) & !zero);
+   end
+
+   assign out = result0 ^ result1 ^ (result2 | result3);
+endmodule
diff --git a/test_regress/t/t_emit_constw.pl b/test_regress/t/t_emit_constw.pl
index 9b1487fcd..8f7895804 100755
--- a/test_regress/t/t_emit_constw.pl
+++ b/test_regress/t/t_emit_constw.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(simulator => 1);
 
 compile(
-    verilator_flags2 => ['--Ox'],
+    verilator_flags2 => ['--fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_extract_static_const_no_merge.pl b/test_regress/t/t_extract_static_const_no_merge.pl
index ff9a694d4..f656fe455 100755
--- a/test_regress/t/t_extract_static_const_no_merge.pl
+++ b/test_regress/t/t_extract_static_const_no_merge.pl
@@ -14,7 +14,7 @@ top_filename("t/t_extract_static_const.v");
 golden_filename("t/t_extract_static_const.out");
 
 compile(
-    verilator_flags2 => ["--stats", "--no-merge-const-pool"],
+    verilator_flags2 => ["--stats", "--fno-merge-const-pool"],
     );
 
 execute(
diff --git a/test_regress/t/t_func_twocall_noexpand.pl b/test_regress/t/t_func_twocall_noexpand.pl
index 001824bc6..452d4b37a 100755
--- a/test_regress/t/t_func_twocall_noexpand.pl
+++ b/test_regress/t/t_func_twocall_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_func_twocall.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_gen_genblk_noinl.pl b/test_regress/t/t_gen_genblk_noinl.pl
index 7574a1cfb..ef537cd4d 100755
--- a/test_regress/t/t_gen_genblk_noinl.pl
+++ b/test_regress/t/t_gen_genblk_noinl.pl
@@ -16,7 +16,7 @@ scenarios(simulator => 1);
 $Self->{sim_time} = 11000;
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_incr_void.pl b/test_regress/t/t_incr_void.pl
index 5b95e5b74..e7d3e18e3 100755
--- a/test_regress/t/t_incr_void.pl
+++ b/test_regress/t/t_incr_void.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(simulator => 1);
 
 compile(
-    verilator_flags2 => ["--Os -x-assign 0"],
+    verilator_flags2 => ["--fno-split -x-assign 0"],
     );
 
 execute(
diff --git a/test_regress/t/t_inst_slice_noinl.pl b/test_regress/t/t_inst_slice_noinl.pl
index 11f75c752..aa56e6155 100755
--- a/test_regress/t/t_inst_slice_noinl.pl
+++ b/test_regress/t/t_inst_slice_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_inst_slice.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface1_modport_noinl.pl b/test_regress/t/t_interface1_modport_noinl.pl
index b077bef4e..4f4b314ae 100755
--- a/test_regress/t/t_interface1_modport_noinl.pl
+++ b/test_regress/t/t_interface1_modport_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface1_modport.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface1_noinl.pl b/test_regress/t/t_interface1_noinl.pl
index 3c9d8d316..867b1e993 100755
--- a/test_regress/t/t_interface1_noinl.pl
+++ b/test_regress/t/t_interface1_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface1.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface2_noinl.pl b/test_regress/t/t_interface2_noinl.pl
index 57b72e7a7..cad1b6e3d 100755
--- a/test_regress/t/t_interface2_noinl.pl
+++ b/test_regress/t/t_interface2_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface2.v");
 
 compile(
-    verilator_flags2 => ["--top-module t -Oi"],
+    verilator_flags2 => ["--top-module t -fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_array2_noinl.pl b/test_regress/t/t_interface_array2_noinl.pl
index ad389d0fb..7bf1518f5 100755
--- a/test_regress/t/t_interface_array2_noinl.pl
+++ b/test_regress/t/t_interface_array2_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_array2.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_array_noinl.pl b/test_regress/t/t_interface_array_noinl.pl
index 02bf8fd89..df71f77e9 100755
--- a/test_regress/t/t_interface_array_noinl.pl
+++ b/test_regress/t/t_interface_array_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_array.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_down_noinl.pl b/test_regress/t/t_interface_down_noinl.pl
index fb03fc988..34ce5cb69 100755
--- a/test_regress/t/t_interface_down_noinl.pl
+++ b/test_regress/t/t_interface_down_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_down.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen10_noinl.pl b/test_regress/t/t_interface_gen10_noinl.pl
index e5c3f22c5..f691c6d0a 100755
--- a/test_regress/t/t_interface_gen10_noinl.pl
+++ b/test_regress/t/t_interface_gen10_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen10.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen11_noinl.pl b/test_regress/t/t_interface_gen11_noinl.pl
index 82a6a9a27..d1e7dd3c0 100755
--- a/test_regress/t/t_interface_gen11_noinl.pl
+++ b/test_regress/t/t_interface_gen11_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen11.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen12_noinl.pl b/test_regress/t/t_interface_gen12_noinl.pl
index c3f59ba19..8ebecd448 100755
--- a/test_regress/t/t_interface_gen12_noinl.pl
+++ b/test_regress/t/t_interface_gen12_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen12.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen2_noinl.pl b/test_regress/t/t_interface_gen2_noinl.pl
index fc7c4bfb1..eb772bab6 100755
--- a/test_regress/t/t_interface_gen2_noinl.pl
+++ b/test_regress/t/t_interface_gen2_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen2.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen3_noinl.pl b/test_regress/t/t_interface_gen3_noinl.pl
index e49dfc39a..b63c72eb9 100755
--- a/test_regress/t/t_interface_gen3_noinl.pl
+++ b/test_regress/t/t_interface_gen3_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen3.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen4_noinl.pl b/test_regress/t/t_interface_gen4_noinl.pl
index 4a0b00930..e724c2859 100755
--- a/test_regress/t/t_interface_gen4_noinl.pl
+++ b/test_regress/t/t_interface_gen4_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen4.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen5_noinl.pl b/test_regress/t/t_interface_gen5_noinl.pl
index 0873ce9c5..5b4852691 100755
--- a/test_regress/t/t_interface_gen5_noinl.pl
+++ b/test_regress/t/t_interface_gen5_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen5.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen6_noinl.pl b/test_regress/t/t_interface_gen6_noinl.pl
index 4c42c6797..e43d9460a 100755
--- a/test_regress/t/t_interface_gen6_noinl.pl
+++ b/test_regress/t/t_interface_gen6_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen6.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen7_noinl.pl b/test_regress/t/t_interface_gen7_noinl.pl
index 27cb3ea61..458c5f0f6 100755
--- a/test_regress/t/t_interface_gen7_noinl.pl
+++ b/test_regress/t/t_interface_gen7_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen7.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen8_noinl.pl b/test_regress/t/t_interface_gen8_noinl.pl
index ba3b2b132..644d9a10e 100755
--- a/test_regress/t/t_interface_gen8_noinl.pl
+++ b/test_regress/t/t_interface_gen8_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen8.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen9_noinl.pl b/test_regress/t/t_interface_gen9_noinl.pl
index 48f4eb8be..6ac0d6296 100755
--- a/test_regress/t/t_interface_gen9_noinl.pl
+++ b/test_regress/t/t_interface_gen9_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen9.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_gen_noinl.pl b/test_regress/t/t_interface_gen_noinl.pl
index 5813d42eb..17273106f 100755
--- a/test_regress/t/t_interface_gen_noinl.pl
+++ b/test_regress/t/t_interface_gen_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_gen.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_inl.pl b/test_regress/t/t_interface_inl.pl
index efb67ed7f..08dfa385c 100755
--- a/test_regress/t/t_interface_inl.pl
+++ b/test_regress/t/t_interface_inl.pl
@@ -14,7 +14,7 @@ top_filename("t/t_interface.v");
 
 compile(
     # Avoid inlining so we find bugs in the non-inliner connection code
-    verilator_flags2 => ["-Oi"],
+    verilator_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_modport_import_noinl.pl b/test_regress/t/t_interface_modport_import_noinl.pl
index 3821fef11..a9e97bee1 100755
--- a/test_regress/t/t_interface_modport_import_noinl.pl
+++ b/test_regress/t/t_interface_modport_import_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_modport_import.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_modport_inl.pl b/test_regress/t/t_interface_modport_inl.pl
index 9afcd9cdd..eb2ca2181 100755
--- a/test_regress/t/t_interface_modport_inl.pl
+++ b/test_regress/t/t_interface_modport_inl.pl
@@ -14,7 +14,7 @@ top_filename("t/t_interface_modport.v");
 
 compile(
     # Avoid inlining so we find bugs in the non-inliner connection code
-    verilator_flags2 => ["-Oi"],
+    verilator_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_modport_noinl.pl b/test_regress/t/t_interface_modport_noinl.pl
index 4c051df1a..7f1015d23 100755
--- a/test_regress/t/t_interface_modport_noinl.pl
+++ b/test_regress/t/t_interface_modport_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_modport.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_mp_func_noinl.pl b/test_regress/t/t_interface_mp_func_noinl.pl
index 432a7308a..89f4835b5 100755
--- a/test_regress/t/t_interface_mp_func_noinl.pl
+++ b/test_regress/t/t_interface_mp_func_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_mp_func.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_nest_noinl.pl b/test_regress/t/t_interface_nest_noinl.pl
index 9d88a39a0..e042d33c1 100755
--- a/test_regress/t/t_interface_nest_noinl.pl
+++ b/test_regress/t/t_interface_nest_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_nest.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_noinl.pl b/test_regress/t/t_interface_noinl.pl
index 52cb09c98..7be6235ad 100755
--- a/test_regress/t/t_interface_noinl.pl
+++ b/test_regress/t/t_interface_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_interface_twod_noinl.pl b/test_regress/t/t_interface_twod_noinl.pl
index 18f0adf62..e77089cb0 100755
--- a/test_regress/t/t_interface_twod_noinl.pl
+++ b/test_regress/t/t_interface_twod_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_interface_twod.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_lint_setout_bad_noinl.pl b/test_regress/t/t_lint_setout_bad_noinl.pl
index 4b5131821..cbbf96bb4 100755
--- a/test_regress/t/t_lint_setout_bad_noinl.pl
+++ b/test_regress/t/t_lint_setout_bad_noinl.pl
@@ -13,7 +13,7 @@ scenarios(linter => 1);
 top_filename("t/t_lint_setout_bad.v");
 
 lint(
-    verilator_flags2 => ["--lint-only -Oi"],
+    verilator_flags2 => ["--lint-only -fno-inline"],
     fails => 1,
     expect_filename => $Self->{golden_filename},
     );
diff --git a/test_regress/t/t_math_cond_huge_noexpand.pl b/test_regress/t/t_math_cond_huge_noexpand.pl
index 0ae4e3ce4..15399cb9f 100755
--- a/test_regress/t/t_math_cond_huge_noexpand.pl
+++ b/test_regress/t/t_math_cond_huge_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_cond_huge.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_math_div_noexpand.pl b/test_regress/t/t_math_div_noexpand.pl
index 4dbcba15c..fa7ecd2ec 100755
--- a/test_regress/t/t_math_div_noexpand.pl
+++ b/test_regress/t/t_math_div_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_div.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_math_eq_noexpand.pl b/test_regress/t/t_math_eq_noexpand.pl
index f8b2375c0..2c3907b70 100755
--- a/test_regress/t/t_math_eq_noexpand.pl
+++ b/test_regress/t/t_math_eq_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_eq.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_math_red_noexpand.pl b/test_regress/t/t_math_red_noexpand.pl
index 89e54c0c9..655ce0246 100755
--- a/test_regress/t/t_math_red_noexpand.pl
+++ b/test_regress/t/t_math_red_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_red.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_math_shift_noexpand.pl b/test_regress/t/t_math_shift_noexpand.pl
index acf420f1a..e27343a72 100755
--- a/test_regress/t/t_math_shift_noexpand.pl
+++ b/test_regress/t/t_math_shift_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_shift.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_math_signed_noexpand.pl b/test_regress/t/t_math_signed_noexpand.pl
index 336d35594..b086af557 100755
--- a/test_regress/t/t_math_signed_noexpand.pl
+++ b/test_regress/t/t_math_signed_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_signed.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_math_vliw_noexpand.pl b/test_regress/t/t_math_vliw_noexpand.pl
index fce202e04..5ca1e425f 100755
--- a/test_regress/t/t_math_vliw_noexpand.pl
+++ b/test_regress/t/t_math_vliw_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_math_vliw.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_mem_multi_io.pl b/test_regress/t/t_mem_multi_io.pl
index 1691d75f1..4e371f1d7 100755
--- a/test_regress/t/t_mem_multi_io.pl
+++ b/test_regress/t/t_mem_multi_io.pl
@@ -12,7 +12,7 @@ scenarios(simulator => 1);
 
 compile(
     # Disable inlining, this test is trivial without it
-    verilator_flags2 => ["-Oi --trace"],
+    verilator_flags2 => ["-fno-inline --trace"],
     verilator_flags3 => [],
     );
 
diff --git a/test_regress/t/t_mem_multi_io2_cc.pl b/test_regress/t/t_mem_multi_io2_cc.pl
index 3edda698b..bfd551aed 100755
--- a/test_regress/t/t_mem_multi_io2_cc.pl
+++ b/test_regress/t/t_mem_multi_io2_cc.pl
@@ -15,7 +15,7 @@ top_filename("t/t_mem_multi_io2.v");
 compile(
     make_top_shell => 0,
     make_main => 0,
-    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io2.cpp -Oi"],
+    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io2.cpp -fno-inline"],
     verilator_flags3 => [],
     );
 
diff --git a/test_regress/t/t_mem_multi_io2_sc.pl b/test_regress/t/t_mem_multi_io2_sc.pl
index 11ae8cbfc..2fb4bf70c 100755
--- a/test_regress/t/t_mem_multi_io2_sc.pl
+++ b/test_regress/t/t_mem_multi_io2_sc.pl
@@ -15,7 +15,7 @@ top_filename("t/t_mem_multi_io2.v");
 compile(
     make_top_shell => 0,
     make_main => 0,
-    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io2.cpp --sc -Oi"],
+    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io2.cpp --sc -fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_mem_multi_io3_cc.pl b/test_regress/t/t_mem_multi_io3_cc.pl
index 4ad019dbf..b6090a775 100755
--- a/test_regress/t/t_mem_multi_io3_cc.pl
+++ b/test_regress/t/t_mem_multi_io3_cc.pl
@@ -15,7 +15,7 @@ top_filename("t/t_mem_multi_io3.v");
 compile(
     make_top_shell => 0,
     make_main => 0,
-    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io3.cpp -Oi"],
+    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io3.cpp -fno-inline"],
     verilator_flags3 => [],
     );
 
diff --git a/test_regress/t/t_mem_multi_io3_sc.pl b/test_regress/t/t_mem_multi_io3_sc.pl
index 5825c7845..f37d9dedd 100755
--- a/test_regress/t/t_mem_multi_io3_sc.pl
+++ b/test_regress/t/t_mem_multi_io3_sc.pl
@@ -15,7 +15,7 @@ top_filename("t/t_mem_multi_io3.v");
 compile(
     make_top_shell => 0,
     make_main => 0,
-    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io3.cpp --sc -Oi"],
+    verilator_flags2 => ["--exe $Self->{t_dir}/t_mem_multi_io3.cpp --sc -fno-inline"],
     verilator_flags3 => [],
     );
 
diff --git a/test_regress/t/t_mem_multidim_Ox.pl b/test_regress/t/t_mem_multidim_Ox.pl
index bb4dbc122..ccde0bbbd 100755
--- a/test_regress/t/t_mem_multidim_Ox.pl
+++ b/test_regress/t/t_mem_multidim_Ox.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_mem_multidim.v");
 
 compile(
-    verilator_flags2 => ['--Ox'],
+    verilator_flags2 => ['--fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_mem_packed_noexpand.pl b/test_regress/t/t_mem_packed_noexpand.pl
index d5fc2b5da..df4c82d6d 100755
--- a/test_regress/t/t_mem_packed_noexpand.pl
+++ b/test_regress/t/t_mem_packed_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_mem_packed.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(
diff --git a/test_regress/t/t_merge_cond.pl b/test_regress/t/t_merge_cond.pl
index 51f97242d..971a808af 100755
--- a/test_regress/t/t_merge_cond.pl
+++ b/test_regress/t/t_merge_cond.pl
@@ -21,11 +21,11 @@ execute(
 if ($Self->{vlt}) {
     # Note, with vltmt this might be split differently, so only checking vlt
     file_grep($Self->{stats}, qr/Optimizations, MergeCond merges\s+(\d+)/i,
-              10);
+              9);
     file_grep($Self->{stats}, qr/Optimizations, MergeCond merged items\s+(\d+)/i,
               580);
     file_grep($Self->{stats}, qr/Optimizations, MergeCond longest merge\s+(\d+)/i,
-              64);
+              128);
 }
 
 ok(1);
diff --git a/test_regress/t/t_merge_cond_blowup.pl b/test_regress/t/t_merge_cond_blowup.pl
new file mode 100755
index 000000000..aa9e8e1fe
--- /dev/null
+++ b/test_regress/t/t_merge_cond_blowup.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/env perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2022 by Geza Lore. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+scenarios(vlt => 1);
+
+# TODO: This takes excessively long on vltmt; this should be fixed.
+
+compile(
+    verilator_flags2 => ["--unroll-count 1000000000", "--output-split 0", "--stats"],
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+if ($Self->{vlt}) {
+    # Note, with vltmt this might be split differently, so only checking vlt
+    file_grep($Self->{stats}, qr/Optimizations, MergeCond merges\s+(\d+)/i,
+              500);   # V3MergeCond.cpp MAX_DISTANCE
+    file_grep($Self->{stats}, qr/Optimizations, MergeCond merged items\s+(\d+)/i,
+              1000);  # V3MergeCond.cpp MAX_DISTANCE *2
+    file_grep($Self->{stats}, qr/Optimizations, MergeCond longest merge\s+(\d+)/i,
+              2);
+}
+
+ok(1);
+1;
diff --git a/test_regress/t/t_merge_cond_blowup.v b/test_regress/t/t_merge_cond_blowup.v
new file mode 100644
index 000000000..aa97f8f26
--- /dev/null
+++ b/test_regress/t/t_merge_cond_blowup.v
@@ -0,0 +1,55 @@
+// DESCRIPTION: Verilator: Verilog Test module
+//
+// This file ONLY is placed under the Creative Commons Public Domain, for
+// any use, without warranty, 2022 by Geza Lore.
+// SPDX-License-Identifier: CC0-1.0
+
+module t (/*AUTOARG*/
+   // Inputs
+   clk
+   );
+   input clk;
+
+   localparam int N = 4096;
+
+   integer cyc = 0;
+   reg [63:0] crc= 64'h5aef0c8d_d70a4497;
+
+   always @ (posedge clk) begin
+      cyc <= cyc + 1;
+      crc <= {crc[62:0], crc[63] ^ crc[2] ^ crc[0]};
+
+      if (cyc==99) begin
+         $write("*-* All Finished *-*\n");
+         $finish;
+      end
+   end
+
+   reg a [N-1:0];
+   reg b [N-1:0];
+
+   // This yields pathological complexity for the current conditional merging
+   // algorithm. Note in practice, other parts of the compiler blow up on this
+   // code far earlier than the conditional merging, but here we go anyway.
+   generate
+      genvar i;
+      for (i = 0 ; i < N ; i = i + 1) begin
+        always @(posedge clk) a[i] <= (crc + 64'(i)) == 0 ? crc[(i+16)%64] : crc[(i+32)%64];
+      end
+      for (i = 0 ; i < N ; i = i + 1) begin
+        always @(posedge clk) b[i] <= (crc + 64'(i)) == 0 ? crc[(i+16)%64] : crc[(i+32)%64];
+      end
+   endgenerate
+
+   always @(posedge clk) begin
+      if (cyc >= 2) begin
+        for (int i = 0 ; i < N ; i = i + 1) begin
+          if (a[i] !== b[i]) begin
+            $write("%%Error: %s:%0d: cyc=%0d i=%0d a[i]='h%x b[i]='h%x\n", `__FILE__,`__LINE__, cyc, i, a[i], b[i]);
+            $stop;
+          end
+        end
+      end
+   end
+
+endmodule
diff --git a/test_regress/t/t_mod_interface_array0_noinl.pl b/test_regress/t/t_mod_interface_array0_noinl.pl
index 3c74fd016..56032e0d9 100755
--- a/test_regress/t/t_mod_interface_array0_noinl.pl
+++ b/test_regress/t/t_mod_interface_array0_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_mod_interface_array0.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_mod_interface_array1_noinl.pl b/test_regress/t/t_mod_interface_array1_noinl.pl
index 34871282a..651bb1c65 100755
--- a/test_regress/t/t_mod_interface_array1_noinl.pl
+++ b/test_regress/t/t_mod_interface_array1_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_mod_interface_array1.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_mod_interface_array2_noinl.pl b/test_regress/t/t_mod_interface_array2_noinl.pl
index c19612e57..2afa9e020 100755
--- a/test_regress/t/t_mod_interface_array2_noinl.pl
+++ b/test_regress/t/t_mod_interface_array2_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_mod_interface_array2.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_mod_interface_array4_noinl.pl b/test_regress/t/t_mod_interface_array4_noinl.pl
index 6797c1016..62ad2ca24 100755
--- a/test_regress/t/t_mod_interface_array4_noinl.pl
+++ b/test_regress/t/t_mod_interface_array4_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_mod_interface_array4.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_mod_interface_array6_noinl.pl b/test_regress/t/t_mod_interface_array6_noinl.pl
index 5244ac42c..f07ea1917 100755
--- a/test_regress/t/t_mod_interface_array6_noinl.pl
+++ b/test_regress/t/t_mod_interface_array6_noinl.pl
@@ -13,7 +13,7 @@ scenarios(simulator => 1);
 top_filename("t/t_mod_interface_array6.v");
 
 compile(
-    v_flags2 => ["-Oi"],
+    v_flags2 => ["-fno-inline"],
     );
 
 execute(
diff --git a/test_regress/t/t_optm_if_cond.pl b/test_regress/t/t_optm_if_cond.pl
index 91aa0aae5..3215fbf08 100755
--- a/test_regress/t/t_optm_if_cond.pl
+++ b/test_regress/t/t_optm_if_cond.pl
@@ -11,7 +11,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 scenarios(vlt => 1);
 
 compile(
-    verilator_flags2 => ['--stats', "-Ow"],
+    verilator_flags2 => ['--stats', "-fno-merge-cond"],
     );
 
 if ($Self->{vlt_all}) {
diff --git a/test_regress/t/t_trace_c_api.cpp b/test_regress/t/t_trace_c_api.cpp
deleted file mode 100644
index d2d3f0921..000000000
--- a/test_regress/t/t_trace_c_api.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// -*- mode: C++; c-file-style: "cc-mode" -*-
-//
-// DESCRIPTION: Verilator: Verilog Test module
-//
-// This file ONLY is placed under the Creative Commons Public Domain, for
-// any use, without warranty, 2008 by Wilson Snyder.
-// SPDX-License-Identifier: CC0-1.0
-
-#include <verilated.h>
-#include <verilated_vcd_c.h>
-
-#include VM_PREFIX_INCLUDE
-
-double sc_time_stamp() { return 0; }
-
-extern void vcdTestMain(const char* filenamep);
-
-int main(int argc, char** argv, char** env) {
-    const char* filenamep = VL_STRINGIFY(TEST_OBJ_DIR) "/simx.vcd";
-    printf("Writing %s\n", filenamep);
-    vcdTestMain(filenamep);
-    printf("*-* All Finished *-*\n");
-    return 0;
-}
diff --git a/test_regress/t/t_trace_c_api.v b/test_regress/t/t_trace_c_api.v
deleted file mode 100644
index 7b440cb91..000000000
--- a/test_regress/t/t_trace_c_api.v
+++ /dev/null
@@ -1,8 +0,0 @@
-// DESCRIPTION: Verilator: Verilog Test module
-//
-// This file ONLY is placed under the Creative Commons Public Domain, for
-// any use, without warranty, 2013 by Wilson Snyder.
-// SPDX-License-Identifier: CC0-1.0
-
-module t;
-endmodule
diff --git a/test_regress/t/t_trace_complex_old_api.pl b/test_regress/t/t_trace_complex_old_api.pl
deleted file mode 100755
index 8136d3f79..000000000
--- a/test_regress/t/t_trace_complex_old_api.pl
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env perl
-if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
-# DESCRIPTION: Verilator: Verilog Test driver/expect definition
-#
-# Copyright 2003-2009 by Wilson Snyder. This program is free software; you
-# can redistribute it and/or modify it under the terms of either the GNU
-# Lesser General Public License Version 3 or the Perl Artistic License
-# Version 2.0.
-# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
-
-# Same test as t_trace_complex, but exercising the old VCD tracing API
-
-scenarios(vlt => 1);
-
-top_filename("t/t_trace_complex.v");
-golden_filename("t/t_trace_complex.out");
-
-compile(
-    verilator_flags2 => ['--cc --trace -CFLAGS -DVL_TRACE_VCD_OLD_API'],
-    );
-
-execute(
-    check_finished => 1,
-    );
-
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_strp /);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_strp_strp /);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arrp /);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_arrp /);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arrp_strp /);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arru\[/);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arru\[/);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arru_arrp\[/);
-file_grep("$Self->{obj_dir}/simx.vcd", qr/ v_arru_strp\[/);
-
-vcd_identical("$Self->{obj_dir}/simx.vcd", $Self->{golden_filename});
-
-ok(1);
-1;
diff --git a/test_regress/t/t_unpacked_concat_bad.out b/test_regress/t/t_unpacked_concat_bad.out
index 4c89adfe6..1482e7507 100644
--- a/test_regress/t/t_unpacked_concat_bad.out
+++ b/test_regress/t/t_unpacked_concat_bad.out
@@ -1,23 +1,6 @@
-%Error-UNSUPPORTED: t/t_unpacked_concat_bad.v:17:46: Unsupported: Replication to form 'bit[31:0]$[1:0]' data type
+%Error-UNSUPPORTED: t/t_unpacked_concat_bad.v:17:46: Unsupported: Non-1 replication to form 'bit[31:0]$[1:0]' data type
                                                    : ... In instance t
    17 |    localparam bit_int_t count_bits [1:0] = {2{$bits(count_t)}};
       |                                              ^
                     ... For error description see https://verilator.org/warn/UNSUPPORTED?v=latest
-%Warning-WIDTHCONCAT: t/t_unpacked_concat_bad.v:17:47: Unsized numbers/parameters not allowed in replications.
-                                                     : ... In instance t
-   17 |    localparam bit_int_t count_bits [1:0] = {2{$bits(count_t)}};
-      |                                               ^~~~~
-                      ... Use "/* verilator lint_off WIDTHCONCAT */" and lint_on around source to disable this message.
-%Error-UNSUPPORTED: t/t_unpacked_concat_bad.v:18:45: Unsupported: Replication to form 'bit[31:0]$[1:0]' data type
-                                                   : ... In instance t
-   18 |    localparam bit_int_t count_bitsc [1:0] = {$bits(count_t), $bits(count_t)};
-      |                                             ^
-%Warning-WIDTHCONCAT: t/t_unpacked_concat_bad.v:18:46: Unsized numbers/parameters not allowed in concatenations.
-                                                     : ... In instance t
-   18 |    localparam bit_int_t count_bitsc [1:0] = {$bits(count_t), $bits(count_t)};
-      |                                              ^~~~~
-%Warning-WIDTHCONCAT: t/t_unpacked_concat_bad.v:18:60: Unsized numbers/parameters not allowed in replications.
-                                                     : ... In instance t
-   18 |    localparam bit_int_t count_bitsc [1:0] = {$bits(count_t), $bits(count_t)};
-      |                                                            ^
 %Error: Exiting due to
diff --git a/test_regress/t/t_var_assign_landr_noexpand.pl b/test_regress/t/t_var_assign_landr_noexpand.pl
index cd058334d..e616f77c3 100755
--- a/test_regress/t/t_var_assign_landr_noexpand.pl
+++ b/test_regress/t/t_var_assign_landr_noexpand.pl
@@ -13,7 +13,7 @@ scenarios(vlt => 1);
 top_filename("t/t_var_assign_landr.v");
 
 compile(
-    verilator_flags2 => ['-Ox'],
+    verilator_flags2 => ['-fno-expand'],
     );
 
 execute(