diff --git a/Changes b/Changes
index f3f35f0d8..ba1ca92e4 100644
--- a/Changes
+++ b/Changes
@@ -11,6 +11,7 @@ contributors that suggested a given feature are shown in []. Thanks!
 Verilator 4.213 devel
 ==========================
 
+* Add profile-guided optimization of mtasks (#3150).
 * Verilator_gantt has removed the ASCII graphics, use the VCD output instead.
 * Verilator_gantt now shows the predicted mtask times, eval times, and additional statistics.
 * Verilator_gantt data files now include processor information, to allow later processing.
diff --git a/bin/verilator b/bin/verilator
index 4554648b1..fbd06550a 100755
--- a/bin/verilator
+++ b/bin/verilator
@@ -447,6 +447,7 @@ description of these arguments.
      +verilator+prof+threads+file+<filename>  Set profile filename
      +verilator+prof+threads+start+<value>    Set profile starting point
      +verilator+prof+threads+window+<value>   Set profile duration
+     +verilator+prof+vlt+file+<filename>      Set profile guided filename
      +verilator+rand+reset+<value>     Set random reset technique
      +verilator+seed+<value>           Set random seed
      +verilator+V                      Verbose version and config
diff --git a/docs/guide/exe_sim.rst b/docs/guide/exe_sim.rst
index bd805aab8..ba49b2ca7 100644
--- a/docs/guide/exe_sim.rst
+++ b/docs/guide/exe_sim.rst
@@ -62,6 +62,12 @@ Summary:
    makes sense for a single-clock-domain module where it's typical to want
    to capture one posedge eval() and one negedge eval().
 
+.. option:: +verilator+prof+vlt+file+<filename>
+
+   When a model was Verilated using :vlopt:`--prof-threads`, sets the
+   filename to which the runtime dumps profile-guided optimization data.
+   Defaults to :file:`profile.vlt`.
+
 .. option:: +verilator+rand+reset+<value>
 
    When a model was Verilated using :vlopt:`--x-initial unique
diff --git a/docs/guide/exe_verilator.rst b/docs/guide/exe_verilator.rst
index 922b39e39..8ed959e21 100644
--- a/docs/guide/exe_verilator.rst
+++ b/docs/guide/exe_verilator.rst
@@ -833,7 +833,7 @@ Summary:
 .. option:: --prof-threads
 
    Enable gantt chart data collection for threaded builds. See :ref:`Thread
-   Profiling`.
+   Profiling` and :ref:`Thread PGO`.
 
 .. option:: --protect-key <key>
 
@@ -1612,6 +1612,12 @@ The grammar of configuration commands is as follows:
    :option:`/*verilator&32;public_flat*/`, etc, metacomments. See
    e.g. :ref:`VPI Example`.
 
+.. option:: profile_data -model "<model_name>" -mtask "<mtask_hash>" -cost <cost_value>
+
+   Feeds profile-guided optimization data into the Verilator algorithms in
+   order to improve model runtime performance.  This option is not expected
+   to be used by users directly.  See :ref:`Thread PGO`.
+
 .. option:: sc_bv -module "<modulename>" [-task "<taskname>"] -var "<signame>"
 
 .. option:: sc_bv -module "<modulename>" [-function "<funcname>"] -var "<signame>"
diff --git a/docs/guide/files.rst b/docs/guide/files.rst
index 8395e4aac..3f30fbb39 100644
--- a/docs/guide/files.rst
+++ b/docs/guide/files.rst
@@ -145,3 +145,23 @@ After running Make, the C++ compiler may produce the following:
      - Intermediate dependencies
    * - *{prefix}{misc}*\ .o
      - Intermediate objects
+
+The Verilated executable may produce the following:
+
+.. list-table::
+
+   * - coverage.dat
+     - Code coverage output, and default input filename for :command:`verilator_coverage`
+   * - gmon.out
+     - GCC/clang code profiler output, often fed into :command:`verilator_profcfunc`
+   * - profile.vlt
+     - :vlopt:`--prof-threads` data file for :ref:`Thread PGO`
+   * - profile_threads.dat
+     - :vlopt:`--prof-threads` data file for :command:`verilator_gantt`
+
+Verilator_gantt may produce the following:
+
+.. list-table::
+
+   * - profile_threads.vcd
+     - Gantt report waveform output
diff --git a/docs/guide/simulating.rst b/docs/guide/simulating.rst
index 24bb0251a..98114d886 100644
--- a/docs/guide/simulating.rst
+++ b/docs/guide/simulating.rst
@@ -26,7 +26,8 @@ risk of reset bugs in trade for performance; see the above documentation
 for these options.
 
 If using Verilated multithreaded, use ``numactl`` to ensure you are using
-non-conflicting hardware resources. See :ref:`Multithreading`.
+non-conflicting hardware resources. See :ref:`Multithreading`. Also
+consider using profile-guided optimization, see :ref:`Thread PGO`.
 
 Minor Verilog code changes can also give big wins.  You should not have any
 UNOPTFLAT warnings from Verilator.  Fixing these warnings can result in
@@ -93,9 +94,7 @@ cases, for example regressions, it is usually worth spending extra
 compilation time to reduce total CPU time.
 
 If you will be running many simulations on a single model, you can
-investigate profile guided optimization. With GCC, using GCC's
-"-fprofile-arcs", then GCC's "-fbranch-probabilities" will yield another
-15% or so.
+investigate profile guided optimization. See :ref:`Compiler PGO`.
 
 Modern compilers also support link-time optimization (LTO), which can help
 especially if you link in DPI code. To enable LTO on GCC, pass "-flto" in
@@ -298,6 +297,9 @@ With the :vlopt:`--prof-threads` option, Verilator will:
 * Add code to save profiling data in non-human-friendly form to the file
   specified with :vlopt:`+verilator+prof+threads+file+\<filename\>`.
 
+* Add code to save profiling data for thread profile-guided
+  optimization. See :ref:`Thread PGO`.
+
 The :command:`verilator_gantt` program may then be run to transform the
 saved profiling file into a nicer visual format and produce some related
 statistics.
@@ -314,6 +316,7 @@ statistics.
 
 For more information see :command:`verilator_gantt`.
 
+
 .. _Profiling ccache efficiency:
 
 Profiling ccache efficiency
@@ -377,3 +380,120 @@ For example:
          os >> main_time;
          os >> *topp;
      }
+
+
+Profile-Guided Optimization
+===========================
+
+Profile-guided optimization is the technique where profiling data is
+collected by running your simulation executable, then this information is
+used to guide the next Verilation or compilation.
+
+There are two forms of profile-guided optimizations.  Unfortunately for
+best results they must each be performed from the highest level code to the
+lowest, which means performing them separately and in this order:
+
+* :ref:`Thread PGO`
+* :ref:`Compiler PGO`
+
+Other forms of PGO may be supported in the future, such as clock and reset
+toggle rate PGO, branch prediction PGO, statement execution time PGO, or
+others as they prove beneficial.
+
+
+.. _Thread PGO:
+
+Thread Profile-Guided Optimization
+----------------------------------
+
+Verilator supports thread profile-guided optimization (Thread PGO) to
+improve multithreaded performance.
+
+When using multithreading, Verilator computes how long macro tasks take and
+tries to balance those across threads.  (What is a macro-task?  See the
+Verilator internals document, :file:`docs/internals.rst` in the
+distribution.)  If the estimations are incorrect, the threads will not be
+balanced, leading to decreased performance.  Thread PGO allows collecting
+profiling data to replace the estimates and better optimize these
+decisions.
+
+To use Thread PGO, Verilate the model with the :vlopt:`--prof-threads`
+option.
+
+Run the model executable. When the executable exits, it will create a
+profile.vlt file.
+
+Rerun Verilator, optionally omitting the :vlopt:`--prof-threads` option,
+and adding the profile.vlt generated earlier to the command line.
+
+Note there is no Verilator equivalent to GCC's -fprofile-use. Verilator's
+profile data file (profile.vlt) can be placed on the verilator command line
+directly without any prefix.
+
+If results from multiple simulations are to be used in generating the
+optimization, multiple simulations' profile.vlt files may be concatenated
+externally, or each of the files may be fed as separate command line
+options into Verilator.  Verilator will simply sum the profile results, so
+a longer running test will have proportionally more weight for optimization
+than a shorter running test.
+
+If you provide any profile feedback data to Verilator, and it cannot use
+it, it will issue the :option:`PROFOUTOFDATE` warning that threads were
+scheduled using estimated costs.  This usually indicates that the profile
+data was generated from different Verilog source code than Verilator is
+currently running against. Therefore, repeat the data collection phase to
+create new profiling data, then rerun Verilator with the same input source
+files and that new profiling data.
+
+
+.. _Compiler PGO:
+
+Compiler Profile-Guided Optimization
+------------------------------------
+
+GCC and Clang support compiler profile-guided optimization (PGO). This
+optimizes any C/C++ program including Verilated code.  Using compiler PGO
+typically yields improvements of 5-15% on both single-threaded and
+multi-threaded models.
+
+To use compiler PGO with GCC or Clang, please see the appropriate compiler
+documentation.  The process in GCC 10 was as follows:
+
+1. Compile the Verilated model with the compiler's "-fprofile-generate"
+   flag:
+
+   .. code-block:: bash
+
+      verilator [whatever_flags] --build \
+          -CFLAGS -fprofile-generate -LDFLAGS -fprofile-generate
+
+   or, if calling make yourself, add -fprofile-generate appropriately to your
+   Makefile.
+
+2. Run your simulation. This will create \*.gcda file(s) in the same
+   directory as the source files.
+
+3. Recompile the model with -fprofile-use. The compiler will read the
+   \*.gcda file(s).
+
+   For GCC:
+
+   .. code-block:: bash
+
+      verilator [whatever_flags] --build \
+          -CFLAGS "-fprofile-use -fprofile-correction"
+
+   For Clang:
+
+   .. code-block:: bash
+
+      llvm-profdata merge -output default.profdata *.profraw
+      verilator [whatever_flags] --build \
+          -CFLAGS "-fprofile-use -fprofile-correction"
+
+   or, if calling make yourself, add these CFLAGS switches appropriately to
+   your Makefile.
+
+Clang and GCC also support -fauto-profile which uses sample-based
+feedback-directed optimization.  See the appropriate compiler
+documentation.
diff --git a/docs/guide/warnings.rst b/docs/guide/warnings.rst
index 3fab8ea75..abc4a1ca4 100644
--- a/docs/guide/warnings.rst
+++ b/docs/guide/warnings.rst
@@ -1011,6 +1011,22 @@ List Of Warnings
    a var/reg must be used as the target of procedural assignments.
 
 
+.. option:: PROFOUTOFDATE
+
+   Warns that threads were scheduled using estimated costs, despite the
+   fact that data was provided from profile-guided optimization (see
+   :ref:`Thread PGO`) as fed into Verilator using the
+   :option:`profile_data` configuration file option.  This usually
+   indicates that the profile data was generated from different Verilog
+   source code than Verilator is currently running against.
+
+   It is recommended to create new profiling data, then rerun Verilator
+   with the same input source files and that new profiling data.
+
+   Ignoring this warning may only slow simulations; the model will still
+   simulate correctly.
+
+
 .. option:: PROTECTED
 
    Warning that a 'pragma protected' section was encountered. The code
diff --git a/docs/internals.rst b/docs/internals.rst
index 39e3d3862..1b9b3e390 100644
--- a/docs/internals.rst
+++ b/docs/internals.rst
@@ -405,6 +405,9 @@ routines in the sources to rely more heavily on randomness, and
 generally try harder not to keep input nodes together when we have the
 option to scramble things.
 
+Profile-guided optimization makes this a bit better, by adjusting mtask
+scheduling, but this does not yet guide the packing into mtasks.
+
 
 Performance Regression
 """"""""""""""""""""""
diff --git a/include/verilated.cpp b/include/verilated.cpp
index f86f04852..fb349c41f 100644
--- a/include/verilated.cpp
+++ b/include/verilated.cpp
@@ -2258,6 +2258,7 @@ VerilatedContext::VerilatedContext()
     Verilated::lastContextp(this);
     Verilated::threadContextp(this);
     m_ns.m_profThreadsFilename = "profile_threads.dat";
+    m_ns.m_profVltFilename = "profile.vlt";
     m_fdps.resize(31);
     std::fill(m_fdps.begin(), m_fdps.end(), static_cast<FILE*>(nullptr));
     m_fdFreeMct.resize(30);
@@ -2340,6 +2341,14 @@ std::string VerilatedContext::profThreadsFilename() const VL_MT_SAFE {
     const VerilatedLockGuard lock{m_mutex};
     return m_ns.m_profThreadsFilename;
 }
+void VerilatedContext::profVltFilename(const std::string& flag) VL_MT_SAFE {
+    const VerilatedLockGuard lock{m_mutex};  // Hold m_mutex while updating m_ns
+    m_ns.m_profVltFilename = flag;  // Filename given by +verilator+prof+vlt+file+
+}
+std::string VerilatedContext::profVltFilename() const VL_MT_SAFE {
+    const VerilatedLockGuard lock{m_mutex};  // Hold m_mutex while reading m_ns
+    return m_ns.m_profVltFilename;  // Returned by value, so the copy outlives the lock
+}
 void VerilatedContext::randReset(int val) VL_MT_SAFE {
     const VerilatedLockGuard lock{m_mutex};
     m_s.m_randReset = val;
@@ -2495,6 +2504,8 @@ void VerilatedContextImp::commandArgVl(const std::string& arg) {
             profThreadsWindow(std::atol(value.c_str()));
         } else if (commandArgVlValue(arg, "+verilator+prof+threads+file+", value /*ref*/)) {
             profThreadsFilename(value);
+        } else if (commandArgVlValue(arg, "+verilator+prof+vlt+file+", value /*ref*/)) {
+            profVltFilename(value);
         } else if (commandArgVlValue(arg, "+verilator+rand+reset+", value /*ref*/)) {
             randReset(std::atoi(value.c_str()));
         } else if (commandArgVlValue(arg, "+verilator+seed+", value /*ref*/)) {
diff --git a/include/verilated.h b/include/verilated.h
index 1866e34fb..c6d6815c6 100644
--- a/include/verilated.h
+++ b/include/verilated.h
@@ -346,6 +346,7 @@ protected:
         vluint32_t m_profThreadsWindow = 2;  // +prof+threads window size
         // Slow path
         std::string m_profThreadsFilename;  // +prof+threads filename
+        std::string m_profVltFilename;  // +prof+vlt filename
     } m_ns;
 
     mutable VerilatedMutex m_argMutex;  // Protect m_argVec, m_argVecLoaded
@@ -522,6 +523,8 @@ public:  // But for internal use only
     vluint32_t profThreadsWindow() const VL_MT_SAFE { return m_ns.m_profThreadsWindow; }
     void profThreadsFilename(const std::string& flag) VL_MT_SAFE;
     std::string profThreadsFilename() const VL_MT_SAFE;
+    void profVltFilename(const std::string& flag) VL_MT_SAFE;
+    std::string profVltFilename() const VL_MT_SAFE;
 
     // Internal: Find scope
     const VerilatedScope* scopeFind(const char* namep) const VL_MT_SAFE;
diff --git a/include/verilated_profiler.h b/include/verilated_profiler.h
new file mode 100644
index 000000000..129d0870c
--- /dev/null
+++ b/include/verilated_profiler.h
@@ -0,0 +1,143 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//=============================================================================
+//
+// Code available from: https://verilator.org
+//
+// Copyright 2012-2021 by Wilson Snyder. This program is free software; you
+// can redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+//
+//=============================================================================
+///
+/// \file
+/// \brief Verilated general profiling header
+///
+/// This file is not part of the Verilated public-facing API.
+/// It is only for internal use by Verilated library routines.
+///
+//=============================================================================
+
+#ifndef VERILATOR_VERILATED_PROFILER_H_
+#define VERILATOR_VERILATED_PROFILER_H_
+
+#include "verilatedos.h"
+#include "verilated.h"  // for VerilatedMutex and clang annotations
+
+#include <array>
+#include <cassert>
+#include <cstdio>
+#include <deque>
+#include <string>
+
+// Profile record, private class used only by this header.
+// Pairs a name with the index of the counter that holds its data.
+class VerilatedProfilerRec final {
+    std::string m_name;  // Hashed name of mtask/etc
+    size_t m_counterNumber = 0;  // Which counter has data
+public:
+    // METHODS
+    VerilatedProfilerRec(size_t counterNumber, const std::string& name)
+        : m_name{name}
+        , m_counterNumber{counterNumber} {}
+    VerilatedProfilerRec() = default;
+    size_t counterNumber() const { return m_counterNumber; }
+    std::string name() const { return m_name; }
+};
+
+// State shared by every VerilatedProfiler<T_Entries> instantiation.
+// Function-local statics inside a member of the class template would be
+// duplicated per value of T_Entries, so models with different entry counts
+// would neither share the lock nor the first-call flag, and a later model
+// would truncate the data an earlier one wrote.  Private to this header.
+class VerilatedProfilerShared final {
+public:
+    // Mutex serializing all profile file writes in the process
+    static VerilatedMutex& mutex() VL_MT_SAFE {
+        static VerilatedMutex s_mutex;
+        return s_mutex;
+    }
+    // True until the first successful file open; caller must hold mutex()
+    static bool& firstCall() {
+        static bool s_firstCall = true;
+        return s_firstCall;
+    }
+};
+
+// Create some number of bucketed profilers
+template <std::size_t T_Entries> class VerilatedProfiler final {
+    // Counters are stored packed, all together, versus in VerilatedProfilerRec to
+    // reduce cache effects
+    std::array<vluint64_t, T_Entries> m_counters{};  // Time spent on this record
+    std::deque<VerilatedProfilerRec> m_records;  // Record information
+
+public:
+    // METHODS
+    VerilatedProfiler() = default;
+    ~VerilatedProfiler() = default;
+    // Write all registered records to filename as profile_data lines for model modelp
+    void write(const char* modelp, const std::string& filename) VL_MT_SAFE;
+    // Register that counter index 'counter' (must be < T_Entries) belongs to 'name'
+    void addCounter(size_t counter, const std::string& name) {
+        VL_DEBUG_IF(assert(counter < T_Entries););
+        m_records.emplace_back(counter, name);
+    }
+    // Begin timing into counter; pair with stopCounter
+    void startCounter(size_t counter) {
+        vluint64_t val;
+        VL_RDTSC(val);
+        // -= so when we add end time in stopCounter, we already subtracted
+        // out, without needing to hold another temporary
+        m_counters[counter] -= val;
+    }
+    // End timing; adds the end time so the counter accumulates the elapsed delta
+    void stopCounter(size_t counter) {
+        vluint64_t val;
+        VL_RDTSC(val);
+        m_counters[counter] += val;
+    }
+};
+
+template <std::size_t T_Entries>
+void VerilatedProfiler<T_Entries>::write(const char* modelp,
+                                         const std::string& filename) VL_MT_SAFE {
+    // The lock and first-call flag are process-wide, not per-T_Entries
+    const VerilatedLockGuard lock{VerilatedProfilerShared::mutex()};
+    bool& firstCallr = VerilatedProfilerShared::firstCall();
+
+    // On the first call we create the file.  On later calls we append.
+    // So when we have multiple models in an executable, possibly even
+    // running on different threads, each will have a different symtab so
+    // each will collect its own data correctly.  However when each is
+    // destroyed we need to get all the data, not keep overwriting and only
+    // get the last model's data.
+
+    VL_DEBUG_IF(VL_DBG_MSGF("+prof+vlt+file writing to '%s'\n", filename.c_str()););
+
+    FILE* fp = nullptr;
+    if (!firstCallr) fp = std::fopen(filename.c_str(), "a");
+    if (VL_UNLIKELY(!fp))
+        fp = std::fopen(filename.c_str(), "w");  // firstCall, or doesn't exist yet
+    if (VL_UNLIKELY(!fp)) {
+        VL_FATAL_MT(filename.c_str(), 0, "", "+prof+vlt+file file not writable");
+        // cppcheck-suppress resourceLeak   // bug, doesn't realize fp is nullptr
+        return;  // LCOV_EXCL_LINE
+    }
+    firstCallr = false;
+
+    // TODO Perhaps merge with verilated_coverage output format, so can
+    // have a common merging and reporting tool, etc.
+    fprintf(fp, "// Verilated model profile-guided optimization data dump file\n");
+    fprintf(fp, "`verilator_config\n");
+
+    for (const auto& it : m_records) {
+        const std::string& name = it.name();
+        fprintf(fp, "profile_data -model \"%s\" -mtask \"%s\" -cost 64'd%" VL_PRI64 "u\n", modelp,
+                name.c_str(), m_counters[it.counterNumber()]);
+    }
+
+    std::fclose(fp);
+}
+
+#endif
diff --git a/src/V3Config.cpp b/src/V3Config.cpp
index 6b511b6c7..93dd8a81f 100644
--- a/src/V3Config.cpp
+++ b/src/V3Config.cpp
@@ -24,6 +24,7 @@
 #include <map>
 #include <set>
 #include <string>
+#include <unordered_map>
 
 //######################################################################
 // Resolve wildcards in files, modules, ftasks or variables
@@ -346,6 +347,9 @@ using V3ConfigFileResolver = V3ConfigWildcardResolver<V3ConfigFile>;
 class V3ConfigResolver final {
     V3ConfigModuleResolver m_modules;  // Access to module names (with wildcards)
     V3ConfigFileResolver m_files;  // Access to file names (with wildcards)
+    std::unordered_map<string, std::unordered_map<string, vluint64_t>>
+        m_profileData;  // Access to profile_data records
+    FileLine* m_profileFileLine = nullptr;
 
     static V3ConfigResolver s_singleton;  // Singleton (not via local static, as that's slow)
     V3ConfigResolver() = default;
@@ -356,6 +360,20 @@ public:
 
     V3ConfigModuleResolver& modules() { return m_modules; }
     V3ConfigFileResolver& files() { return m_files; }
+
+    void addProfileData(FileLine* fl, const string& model, const string& key, vluint64_t cost) {
+        if (!m_profileFileLine) m_profileFileLine = fl;  // Keep location of first record
+        if (cost == 0) cost = 1;  // Cost 0 is reserved for "no data", so clamp to 1
+        m_profileData[model][key] += cost;  // Repeated model/key records accumulate
+    }
+    vluint64_t getProfileData(const string& model, const string& key) const {
+        const auto mit = m_profileData.find(model);
+        if (mit == m_profileData.cend()) return 0;  // No data for this model
+        const auto it = mit->second.find(key);
+        if (it == mit->second.cend()) return 0;  // No data for this key
+        return it->second;  // Accumulated cost for this model/key
+    }
+    FileLine* getProfileDataFileLine() const { return m_profileFileLine; }  // Null if no data
 };
 
 V3ConfigResolver V3ConfigResolver::s_singleton;
@@ -392,10 +410,6 @@ void V3Config::addIgnore(V3ErrorCode code, bool on, const string& filename, int
     }
 }
 
-void V3Config::addModulePragma(const string& module, AstPragmaType pragma) {
-    V3ConfigResolver::s().modules().at(module).addModulePragma(pragma);
-}
-
 void V3Config::addInline(FileLine* fl, const string& module, const string& ftask, bool on) {
     if (ftask.empty()) {
         V3ConfigResolver::s().modules().at(module).setInline(on);
@@ -408,6 +422,15 @@ void V3Config::addInline(FileLine* fl, const string& module, const string& ftask
     }
 }
 
+void V3Config::addModulePragma(const string& module, AstPragmaType pragma) {
+    V3ConfigResolver::s().modules().at(module).addModulePragma(pragma);  // Via singleton
+}
+
+void V3Config::addProfileData(FileLine* fl, const string& model, const string& key,
+                              vluint64_t cost) {  // Forwards to the V3ConfigResolver singleton
+    V3ConfigResolver::s().addProfileData(fl, model, key, cost);
+}
+
 void V3Config::addVarAttr(FileLine* fl, const string& module, const string& ftask,
                           const string& var, AstAttrType attr, AstSenTree* sensep) {
     // Semantics: sensep only if public_flat_rw
@@ -497,6 +520,13 @@ void V3Config::applyVarAttr(AstNodeModule* modulep, AstNodeFTask* ftaskp, AstVar
     if (vp) vp->apply(varp);
 }
 
+vluint64_t V3Config::getProfileData(const string& model, const string& key) {
+    return V3ConfigResolver::s().getProfileData(model, key);  // 0 when no data is recorded
+}
+FileLine* V3Config::getProfileDataFileLine() {
+    return V3ConfigResolver::s().getProfileDataFileLine();  // May be null
+}
+
 bool V3Config::waive(FileLine* filelinep, V3ErrorCode code, const string& message) {
     V3ConfigFile* filep = V3ConfigResolver::s().files().resolve(filelinep->filename());
     if (!filep) return false;
diff --git a/src/V3Config.h b/src/V3Config.h
index 470827fdf..2931b53e3 100644
--- a/src/V3Config.h
+++ b/src/V3Config.h
@@ -33,17 +33,23 @@ public:
     static void addCoverageBlockOff(const string& file, int lineno);
     static void addCoverageBlockOff(const string& module, const string& blockname);
     static void addIgnore(V3ErrorCode code, bool on, const string& filename, int min, int max);
-    static void addWaiver(V3ErrorCode code, const string& filename, const string& message);
-    static void addModulePragma(const string& module, AstPragmaType pragma);
     static void addInline(FileLine* fl, const string& module, const string& ftask, bool on);
+    static void addModulePragma(const string& module, AstPragmaType pragma);
+    static void addProfileData(FileLine* fl, const string& model, const string& key,
+                               vluint64_t cost);
+    static void addWaiver(V3ErrorCode code, const string& filename, const string& message);
     static void addVarAttr(FileLine* fl, const string& module, const string& ftask,
                            const string& signal, AstAttrType type, AstSenTree* nodep);
+
     static void applyCase(AstCase* nodep);
     static void applyCoverageBlock(AstNodeModule* modulep, AstBegin* nodep);
     static void applyIgnores(FileLine* filelinep);
     static void applyModule(AstNodeModule* modulep);
     static void applyFTask(AstNodeModule* modulep, AstNodeFTask* ftaskp);
     static void applyVarAttr(AstNodeModule* modulep, AstNodeFTask* ftaskp, AstVar* varp);
+
+    static vluint64_t getProfileData(const string& model, const string& key);  // 0 if no data
+    static FileLine* getProfileDataFileLine();
     static bool waive(FileLine* filelinep, V3ErrorCode code, const string& message);
 };
 
diff --git a/src/V3EmitCSyms.cpp b/src/V3EmitCSyms.cpp
index 044332fcf..76e82adf0 100644
--- a/src/V3EmitCSyms.cpp
+++ b/src/V3EmitCSyms.cpp
@@ -21,6 +21,7 @@
 #include "V3EmitC.h"
 #include "V3EmitCBase.h"
 #include "V3LanguageWords.h"
+#include "V3PartitionGraph.h"
 
 #include <algorithm>
 #include <map>
@@ -394,6 +395,7 @@ void EmitCSyms::emitSymHdr() {
     if (v3Global.needTraceDumper()) {
         puts("#include \"" + v3Global.opt.traceSourceLang() + ".h\"\n");
     }
+    if (v3Global.opt.profThreads()) puts("#include \"verilated_profiler.h\"\n");
 
     puts("\n// INCLUDE MODEL CLASS\n");
     puts("\n#include \"" + topClassName() + ".h\"\n");
@@ -475,6 +477,21 @@ void EmitCSyms::emitSymHdr() {
         puts("];\n");
     }
 
+    if (v3Global.opt.profThreads()) {
+        puts("\n// PROFILING\n");
+        vluint64_t maxProfilerId = 0;
+        if (v3Global.opt.mtasks()) {
+            for (const V3GraphVertex* vxp
+                 = v3Global.rootp()->execGraphp()->depGraphp()->verticesBeginp();
+                 vxp; vxp = vxp->verticesNextp()) {
+                ExecMTask* mtp = dynamic_cast<ExecMTask*>(const_cast<V3GraphVertex*>(vxp));
+                if (maxProfilerId < mtp->profilerId()) maxProfilerId = mtp->profilerId();
+            }
+        }
+        ++maxProfilerId;  // As size must include 0
+        puts("VerilatedProfiler<" + cvtToStr(maxProfilerId) + "> _vm_profiler;\n");
+    }
+
     if (!m_scopeNames.empty()) {  // Scope names
         puts("\n// SCOPE NAMES\n");
         for (const auto& itr : m_scopeNames) {
@@ -654,6 +671,7 @@ void EmitCSyms::emitSymImp() {
     }
 
     puts("// FUNCTIONS\n");
+
     // Destructor
     puts(symClassName() + "::~" + symClassName() + "()\n");
     puts("{\n");
@@ -663,7 +681,11 @@ void EmitCSyms::emitSymImp() {
         puts("if (__Vm_dumping) _traceDumpClose();\n");
         puts("#endif  // VM_TRACE\n");
     }
-    if (v3Global.opt.mtasks()) { puts("delete __Vm_threadPoolp;\n"); }
+    if (v3Global.opt.profThreads()) {
+        puts("_vm_profiler.write(\"" + topClassName()
+             + "\", _vm_contextp__->profVltFilename());\n");
+    }
+    if (v3Global.opt.mtasks()) puts("delete __Vm_threadPoolp;\n");
     puts("}\n\n");
 
     // Constructor
@@ -718,6 +740,19 @@ void EmitCSyms::emitSymImp() {
     }
     puts("{\n");
 
+    if (v3Global.opt.profThreads()) {
+        puts("// Configure profiling\n");
+        if (v3Global.opt.mtasks()) {
+            for (const V3GraphVertex* vxp
+                 = v3Global.rootp()->execGraphp()->depGraphp()->verticesBeginp();
+                 vxp; vxp = vxp->verticesNextp()) {
+                ExecMTask* mtp = dynamic_cast<ExecMTask*>(const_cast<V3GraphVertex*>(vxp));
+                puts("_vm_profiler.addCounter(" + cvtToStr(mtp->profilerId()) + ", \""
+                     + mtp->hashName() + "\");\n");
+            }
+        }
+    }
+
     puts("// Configure time unit / time precision\n");
     if (!v3Global.rootp()->timeunit().isNone()) {
         puts("_vm_contextp__->timeunit(");
diff --git a/src/V3Error.h b/src/V3Error.h
index cac812106..aaa444cec 100644
--- a/src/V3Error.h
+++ b/src/V3Error.h
@@ -110,6 +110,7 @@ public:
         PINNOTFOUND,    // instance port name not found in it's module
         PKGNODECL,      // Error: Package/class needs to be predeclared
         PROCASSWIRE,    // Procedural assignment on wire
+        PROFOUTOFDATE,  // Profile data out of date
         PROTECTED,      // detected `pragma protected
         RANDC,          // Unsupported: 'randc' converted to 'rand'
         REALCVT,        // Real conversion
@@ -173,7 +174,7 @@ public:
             "LATCH", "LITENDIAN", "MODDUP",
             "MULTIDRIVEN", "MULTITOP","NOLATCH", "NULLPORT", "PINCONNECTEMPTY",
             "PINMISSING", "PINNOCONNECT",  "PINNOTFOUND", "PKGNODECL", "PROCASSWIRE",
-            "PROTECTED", "RANDC", "REALCVT", "REDEFMACRO",
+            "PROFOUTOFDATE", "PROTECTED", "RANDC", "REALCVT", "REDEFMACRO",
             "SELRANGE", "SHORTREAL", "SPLITVAR", "STMTDLY", "SYMRSVDWORD", "SYNCASYNCNET",
             "TICKCOUNT", "TIMESCALEMOD",
             "UNDRIVEN", "UNOPT", "UNOPTFLAT", "UNOPTTHREADS",
diff --git a/src/V3Hasher.cpp b/src/V3Hasher.cpp
index 48aaee870..259656989 100644
--- a/src/V3Hasher.cpp
+++ b/src/V3Hasher.cpp
@@ -455,6 +455,9 @@ private:
             iterateNull(nodep->ftaskp());
         });
     }
+    virtual void visit(AstMTaskBody* nodep) override {  // Lambda is empty: no extra hashing here
+        m_hash += hashNodeAndIterate(nodep, HASH_DTYPE, HASH_CHILDREN, [=]() {});
+    }
     virtual void visit(AstNodeProcedure* nodep) override {
         m_hash += hashNodeAndIterate(nodep, HASH_DTYPE, HASH_CHILDREN, [=]() {});
     }
diff --git a/src/V3Partition.cpp b/src/V3Partition.cpp
index d57339526..d9fc5f1d1 100644
--- a/src/V3Partition.cpp
+++ b/src/V3Partition.cpp
@@ -18,6 +18,7 @@
 #include "verilatedos.h"
 
 #include "V3EmitCBase.h"
+#include "V3Config.h"
 #include "V3Os.h"
 #include "V3File.h"
 #include "V3GraphAlg.h"
@@ -27,6 +28,7 @@
 #include "V3PartitionGraph.h"
 #include "V3Scoreboard.h"
 #include "V3Stats.h"
+#include "V3UniqueNames.h"
 
 #include <list>
 #include <memory>
@@ -2615,15 +2617,152 @@ void V3Partition::go(V3Graph* mtasksp) {
     }
 }
 
+// Add 'cost' to cmap[id], creating the entry if absent.  NOTE(review): no caller seen here.
+void add(std::unordered_map<int, vluint64_t>& cmap, int id, vluint64_t cost) { cmap[id] += cost; }
+using EstimateAndProfiled = std::pair<uint64_t, vluint64_t>;  // cost est, cost profiled
+using Costs = std::unordered_map<uint32_t, EstimateAndProfiled>;  // Keyed by mtask id
+
+// Rescale 'costs' in place: first put the estimates on the same scale as the profiled costs,
+// then, if the largest cost exceeds 10000000, scale every cost down by the same factor.
+static void normalizeCosts(Costs& costs) {
+    const auto scaleCost = [](vluint64_t value, double multiplier) {
+        double scaled = static_cast<double>(value) * multiplier;
+        if (value && scaled < 1) scaled = 1;  // A nonzero cost must not become zero
+        // Converting a double >= 2^64 to uint64_t is undefined; saturate just below 2^64
+        if (scaled > 18446744073709549568.0) scaled = 18446744073709549568.0;
+        return static_cast<uint64_t>(scaled);
+    };
+
+    // For all costs with a profile, compute sum
+    vluint64_t sumCostProfiled = 0;  // For data with estimate and profile
+    vluint64_t sumCostEstimate = 0;  // For data with estimate and profile
+    for (const auto& est : costs) {
+        if (est.second.second) {
+            sumCostEstimate += est.second.first;
+            sumCostProfiled += est.second.second;
+        }
+    }
+
+    if (sumCostEstimate) {
+        // For data where we don't have profiled data, compute how much to
+        // scale up/down the estimate to make on same relative scale as
+        // profiled data.  (Improves results if only a few profiles missing.)
+        const double estToProfile
+            = static_cast<double>(sumCostProfiled) / static_cast<double>(sumCostEstimate);
+        UINFO(5, "Estimated data needs scaling by "
+                     << estToProfile << ", sumCostProfiled=" << sumCostProfiled
+                     << " sumCostEstimate=" << sumCostEstimate << endl);
+        for (auto& est : costs) est.second.first = scaleCost(est.second.first, estToProfile);
+    }
+
+    // COSTS can overflow a uint32.  Using maximum value of costs, scale all down
+    vluint64_t maxCost = 0;
+    for (const auto& est : costs) {
+        const uint64_t costEstimate = est.second.first;
+        const uint64_t costProfiled = est.second.second;
+        if (maxCost < costEstimate) maxCost = costEstimate;
+        if (maxCost < costProfiled) maxCost = costProfiled;
+        UINFO(9, "Post uint scale: ce = " << costEstimate << " cp=" << costProfiled << endl);
+    }
+    const vluint64_t scaleDownTo = 10000000;  // Extra room for future algorithms to add costs
+    if (maxCost > scaleDownTo) {
+        const double scaleup = static_cast<double>(scaleDownTo) / static_cast<double>(maxCost);
+        UINFO(5, "Scaling data to within 32-bits by multiply by=" << scaleup << ", maxCost="
+                                                                  << maxCost << endl);
+        for (auto& est : costs) {
+            est.second.first = scaleCost(est.second.first, scaleup);
+            est.second.second = scaleCost(est.second.second, scaleup);
+        }
+    }
+}
+
+void V3Partition::selfTestNormalizeCosts() {  // Checks normalizeCosts() on fixed tables
+    {  // Test that omitted profile data correctly scales estimates
+        Costs costs({// id  est  prof
+                     {1, {10, 1000}},
+                     {2, {20, 0}},  // Note no profile
+                     {3, {30, 3000}}});
+        normalizeCosts(costs);  // Estimate scale is (1000+3000)/(10+30) = 100
+        UASSERT_SELFTEST(uint64_t, costs[1].first, 1000);
+        UASSERT_SELFTEST(uint64_t, costs[1].second, 1000);
+        UASSERT_SELFTEST(uint64_t, costs[2].first, 2000);  // Scaled although unprofiled
+        UASSERT_SELFTEST(uint64_t, costs[2].second, 0);
+        UASSERT_SELFTEST(uint64_t, costs[3].first, 3000);
+        UASSERT_SELFTEST(uint64_t, costs[3].second, 3000);
+    }
+    {  // Test that very large profile data properly scales
+        Costs costs({// id  est  prof
+                     {1, {10, 100000000000}},
+                     {2, {20, 200000000000}},
+                     {3, {30, 1}}});  // Make sure doesn't underflow
+        normalizeCosts(costs);  // Estimates scaled ~5e9x, then all scaled by 1e7/2e11
+        UASSERT_SELFTEST(uint64_t, costs[1].first, 2500000);
+        UASSERT_SELFTEST(uint64_t, costs[1].second, 5000000);
+        UASSERT_SELFTEST(uint64_t, costs[2].first, 5000000);
+        UASSERT_SELFTEST(uint64_t, costs[2].second, 10000000);
+        UASSERT_SELFTEST(uint64_t, costs[3].first, 7500000);
+        UASSERT_SELFTEST(uint64_t, costs[3].second, 1);  // 1 * 5e-5 is bumped back up to 1
+    }
+}
+
+// Set the cost and initial priority of each ExecMTask: the profiled cost when V3Config has
+// one for the mtask's hash name, else the V3InstrCount estimate.  May warn PROFOUTOFDATE.
+static void fillinCosts(V3Graph* execMTaskGraphp) {
+    V3UniqueNames uniqueNames;  // For generating unique mtask profile hash names
+
+    // Pass 1: See what profiling data applies
+    Costs costs;  // For each mtask, costs
+    for (const V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp;
+         vxp = vxp->verticesNextp()) {
+        ExecMTask* mtp = dynamic_cast<ExecMTask*>(const_cast<V3GraphVertex*>(vxp));
+        // Compute name of mtask, for hash lookup
+        mtp->hashName(uniqueNames.get(mtp->bodyp()));
+
+        // This estimate is 64 bits, but the final mtask graph algorithm needs 32 bits
+        vluint64_t costEstimate = V3InstrCount::count(mtp->bodyp(), false);
+        vluint64_t costProfiled = V3Config::getProfileData(v3Global.opt.prefix(), mtp->hashName());
+        if (costProfiled) {
+            UINFO(5, "Profile data for mtask " << mtp->id() << " " << mtp->hashName()
+                                               << " cost override " << costProfiled << endl);
+        }
+        costs[mtp->id()] = std::make_pair(costEstimate, costProfiled);
+    }
+
+    normalizeCosts(costs /*ref*/);
+
+    // Pass 2: Apply the normalized costs to the mtasks
+    int totalEstimates = 0;
+    int missingProfiles = 0;
+    for (const V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp;
+         vxp = vxp->verticesNextp()) {
+        ExecMTask* mtp = dynamic_cast<ExecMTask*>(const_cast<V3GraphVertex*>(vxp));
+        // Keep 64 bits until range-checked; narrowing first would let the checks pass wrongly
+        const uint64_t costEstimate = costs[mtp->id()].first;
+        const uint64_t costProfiled = costs[mtp->id()].second;
+        UINFO(9, "ce = " << costEstimate << " cp=" << costProfiled << endl);
+        UASSERT(costEstimate <= (1UL << 31), "cost scaling math would overflow uint32");
+        UASSERT(costProfiled <= (1UL << 31), "cost scaling math would overflow uint32");
+        // A zero profiled cost is treated as no profile data; fall back to the estimate
+        const uint32_t cost = static_cast<uint32_t>(costProfiled ? costProfiled : costEstimate);
+        if (costEstimate != 0) ++totalEstimates;
+        if (costEstimate != 0 && !costProfiled) ++missingProfiles;
+        mtp->cost(cost);
+        mtp->priority(cost);
+    }
+
+    if (missingProfiles) {
+        if (FileLine* fl = V3Config::getProfileDataFileLine()) {
+            fl->v3warn(PROFOUTOFDATE, "Profile data for mtasks may be out of date. "
+                                          << missingProfiles << " of " << totalEstimates
+                                          << " mtasks had no data");
+        }
+    }
+}
+
 static void finalizeCosts(V3Graph* execMTaskGraphp) {
     GraphStreamUnordered ser(execMTaskGraphp, GraphWay::REVERSE);
-
     while (const V3GraphVertex* vxp = ser.nextp()) {
         ExecMTask* mtp = dynamic_cast<ExecMTask*>(const_cast<V3GraphVertex*>(vxp));
-        uint32_t costCount = V3InstrCount::count(mtp->bodyp(), false);
-        mtp->cost(costCount);
-        mtp->priority(costCount);
-
         // "Priority" is the critical path from the start of the mtask, to
         // the end of the graph reachable from this mtask.  Given the
         // choice among several ready mtasks, we'll want to start the
@@ -2662,6 +2801,14 @@ static void finalizeCosts(V3Graph* execMTaskGraphp) {
         }
     }
 
+    // Assign profiler IDs
+    vluint64_t profilerId = 0;
+    for (const V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp;
+         vxp = vxp->verticesNextp()) {
+        ExecMTask* mtp = dynamic_cast<ExecMTask*>(const_cast<V3GraphVertex*>(vxp));
+        mtp->profilerId(profilerId++);
+    }
+
     // Removing tasks may cause edges that were formerly non-transitive to
     // become transitive. Also we just created new edges around the removed
     // tasks, which could be transitive. Prune out all transitive edges.
@@ -2718,6 +2865,11 @@ static void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t th
                    " " + cvtToStr(mtaskp->cost()) + ");\n" +  //
                    "}\n");
     }
+    if (v3Global.opt.profThreads()) {
+        // No lock around startCounter, as counter numbers are unique per thread
+        addStrStmt("vlSymsp->_vm_profiler.startCounter(" + cvtToStr(mtaskp->profilerId())
+                   + ");\n");
+    }
 
     //
     addStrStmt("Verilated::mtaskId(" + cvtToStr(mtaskp->id()) + ");\n");
@@ -2725,6 +2877,10 @@ static void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t th
     // Move the the actual body of calls to leaf functions into this function
     funcp->addStmtsp(mtaskp->bodyp()->unlinkFrBack());
 
+    if (v3Global.opt.profThreads()) {
+        // No lock around stopCounter, as counter numbers are unique per thread
+        addStrStmt("vlSymsp->_vm_profiler.stopCounter(" + cvtToStr(mtaskp->profilerId()) + ");\n");
+    }
     if (v3Global.opt.profThreads()) {
         addStrStmt("if (VL_UNLIKELY(" + recName + ")) "  //
                    + recName + "->endRecord(VL_RDTSC_Q());\n");
@@ -2851,9 +3007,10 @@ void V3Partition::finalize() {
     // V3LifePost) that can change the cost of logic within each mtask.
     // Now that logic is final, recompute the cost and priority of each
     // ExecMTask.
+    fillinCosts(execGraphp->mutableDepGraphp());
     finalizeCosts(execGraphp->mutableDepGraphp());
 
-    // Replace the graph body with it's multi-threaded implementation.
+    // Replace the graph body with its multi-threaded implementation.
     implementExecGraph(execGraphp);
 }
 
diff --git a/src/V3Partition.h b/src/V3Partition.h
index 0c9ca80d2..8d22d740a 100644
--- a/src/V3Partition.h
+++ b/src/V3Partition.h
@@ -50,6 +50,7 @@ public:
     void go(V3Graph* mtasksp);
 
     static void selfTest();
+    static void selfTestNormalizeCosts();
 
     // Print out a hash of the shape of graphp.  Only needed to debug the
     // origin of some nondeterminism; otherwise this is pretty useless.
diff --git a/src/V3PartitionGraph.h b/src/V3PartitionGraph.h
index 7b81cc737..d4518bb23 100644
--- a/src/V3PartitionGraph.h
+++ b/src/V3PartitionGraph.h
@@ -56,12 +56,14 @@ class ExecMTask final : public AbstractMTask {
 private:
     AstMTaskBody* const m_bodyp;  // Task body
     const uint32_t m_id;  // Unique id of this mtask.
+    string m_hashName;  // Hashed name for profile-driven optimization
     uint32_t m_priority = 0;  // Predicted critical path from the start of
                               // this mtask to the ends of the graph that are reachable from this
                               // mtask. In abstract time units.
     uint32_t m_cost = 0;  // Predicted runtime of this mtask, in the same
                           // abstract time units as priority().
     uint64_t m_predictStart = 0;  // Predicted start time of task
+    uint64_t m_profilerId = 0;  // VerilatedCounter number for profiling
     VL_UNCOPYABLE(ExecMTask);
 
 public:
@@ -77,11 +79,15 @@ public:
     void cost(uint32_t cost) { m_cost = cost; }
     void predictStart(vluint64_t time) { m_predictStart = time; }
     vluint64_t predictStart() const { return m_predictStart; }
+    void profilerId(vluint64_t id) { m_profilerId = id; }  // Set counter number for profiling
+    vluint64_t profilerId() const { return m_profilerId; }
     string cFuncName() const {
         // If this MTask maps to a C function, this should be the name
         return string("__Vmtask") + "__" + cvtToStr(m_id);
     }
     virtual string name() const override { return string("mt") + cvtToStr(id()); }
+    string hashName() const { return m_hashName; }  // Hashed name for profile data lookup
+    void hashName(const string& name) { m_hashName = name; }
     void dump(std::ostream& str) const {
         str << name() << "." << cvtToHex(this);
         if (priority() || cost()) str << " [pr=" << priority() << " c=" << cvtToStr(cost()) << "]";
diff --git a/src/Verilator.cpp b/src/Verilator.cpp
index 28400d9e4..6bde19f70 100644
--- a/src/Verilator.cpp
+++ b/src/Verilator.cpp
@@ -590,6 +590,7 @@ static void verilate(const string& argString) {
         V3TSP::selfTest();
         V3ScoreboardBase::selfTest();
         V3Partition::selfTest();
+        V3Partition::selfTestNormalizeCosts();
         V3Broken::selfTest();
     }
 
diff --git a/src/verilog.l b/src/verilog.l
index 3c831abea..166d9b0f3 100644
--- a/src/verilog.l
+++ b/src/verilog.l
@@ -121,6 +121,7 @@ vnum    {vnum1}|{vnum2}|{vnum3}|{vnum4}|{vnum5}
   "no_clocker"          { FL; return yVLT_NO_CLOCKER; }
   "no_inline"           { FL; return yVLT_NO_INLINE; }
   "parallel_case"       { FL; return yVLT_PARALLEL_CASE; }
+  "profile_data"        { FL; return yVLT_PROFILE_DATA; }
   "public"              { FL; return yVLT_PUBLIC; }
   "public_flat"         { FL; return yVLT_PUBLIC_FLAT; }
   "public_flat_rd"      { FL; return yVLT_PUBLIC_FLAT_RD; }
@@ -133,12 +134,15 @@ vnum    {vnum1}|{vnum2}|{vnum3}|{vnum4}|{vnum5}
   "tracing_on"          { FL; return yVLT_TRACING_ON; }
 
   -?"-block"            { FL; return yVLT_D_BLOCK; }
+  -?"-cost"             { FL; return yVLT_D_COST; }
   -?"-file"             { FL; return yVLT_D_FILE; }
   -?"-function"         { FL; return yVLT_D_FUNCTION; }
   -?"-lines"            { FL; return yVLT_D_LINES; }
   -?"-match"            { FL; return yVLT_D_MATCH; }
+  -?"-model"            { FL; return yVLT_D_MODEL; }
   -?"-module"           { FL; return yVLT_D_MODULE; }
   -?"-msg"              { FL; return yVLT_D_MSG; }
+  -?"-mtask"            { FL; return yVLT_D_MTASK; }
   -?"-rule"             { FL; return yVLT_D_RULE; }
   -?"-task"             { FL; return yVLT_D_TASK; }
   -?"-var"              { FL; return yVLT_D_VAR; }
diff --git a/src/verilog.y b/src/verilog.y
index 55ae4f866..a64457f7d 100644
--- a/src/verilog.y
+++ b/src/verilog.y
@@ -363,6 +363,7 @@ BISONPRE_VERSION(3.7,%define api.header.include {"V3ParseBison.h"})
 %token<fl>              yVLT_NO_CLOCKER             "no_clocker"
 %token<fl>              yVLT_NO_INLINE              "no_inline"
 %token<fl>              yVLT_PARALLEL_CASE          "parallel_case"
+%token<fl>              yVLT_PROFILE_DATA           "profile_data"
 %token<fl>              yVLT_PUBLIC                 "public"
 %token<fl>              yVLT_PUBLIC_FLAT            "public_flat"
 %token<fl>              yVLT_PUBLIC_FLAT_RD         "public_flat_rd"
@@ -375,12 +376,15 @@ BISONPRE_VERSION(3.7,%define api.header.include {"V3ParseBison.h"})
 %token<fl>              yVLT_TRACING_ON             "tracing_on"
 
 %token<fl>              yVLT_D_BLOCK    "--block"
+%token<fl>              yVLT_D_COST     "--cost"
 %token<fl>              yVLT_D_FILE     "--file"
 %token<fl>              yVLT_D_FUNCTION "--function"
 %token<fl>              yVLT_D_LINES    "--lines"
-%token<fl>              yVLT_D_MODULE   "--module"
 %token<fl>              yVLT_D_MATCH    "--match"
+%token<fl>              yVLT_D_MODEL    "--model"
+%token<fl>              yVLT_D_MODULE   "--module"
 %token<fl>              yVLT_D_MSG      "--msg"
+%token<fl>              yVLT_D_MTASK    "--mtask"
 %token<fl>              yVLT_D_RULE     "--rule"
 %token<fl>              yVLT_D_TASK     "--task"
 %token<fl>              yVLT_D_VAR      "--var"
@@ -6404,6 +6408,8 @@ vltItem:
 			{ V3Config::addCaseParallel(*$3, 0); }
 	|	yVLT_PARALLEL_CASE yVLT_D_FILE yaSTRING yVLT_D_LINES yaINTNUM
 			{ V3Config::addCaseParallel(*$3, $5->toUInt()); }
+	|	yVLT_PROFILE_DATA yVLT_D_MODEL yaSTRING yVLT_D_MTASK yaSTRING yVLT_D_COST yaINTNUM
+			{ V3Config::addProfileData($<fl>1, *$3, *$5, $7->toUQuad()); }
 	;
 
 vltOffFront<errcodeen>:
diff --git a/test_regress/t/t_gantt.pl b/test_regress/t/t_gantt.pl
index 04727b3ef..218a86243 100755
--- a/test_regress/t/t_gantt.pl
+++ b/test_regress/t/t_gantt.pl
@@ -27,6 +27,7 @@ execute(
     all_run_flags => ["+verilator+prof+threads+start+2",
                       " +verilator+prof+threads+window+2",
                       " +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
+                      " +verilator+prof+vlt+file+$Self->{obj_dir}/profile.vlt",
                       ],
     check_finished => 1,
     );
diff --git a/test_regress/t/t_gate_tree.pl b/test_regress/t/t_gate_tree.pl
index 77115ef8f..67587a1ba 100755
--- a/test_regress/t/t_gate_tree.pl
+++ b/test_regress/t/t_gate_tree.pl
@@ -121,6 +121,7 @@ execute(
     all_run_flags => ["+verilator+prof+threads+start+100",
                       " +verilator+prof+threads+window+2",
                       " +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
+                      " +verilator+prof+vlt+file+$Self->{obj_dir}/profile.vlt",
                       ],
     check_finished => 1,
     );
diff --git a/test_regress/t/t_pgo_profoutofdate_bad.out b/test_regress/t/t_pgo_profoutofdate_bad.out
new file mode 100644
index 000000000..25f1c8787
--- /dev/null
+++ b/test_regress/t/t_pgo_profoutofdate_bad.out
@@ -0,0 +1,6 @@
+%Warning-PROFOUTOFDATE: t/t_pgo_profoutofdate_bad.v:27:1: Profile data for mtasks may be out of date. 3 of 3 mtasks had no data
+   27 | profile_data -model "x" -mtask "h7baded98__0" -cost 64'd12345678901234567890
+      | ^~~~~~~~~~~~
+                        ... For warning description see https://verilator.org/warn/PROFOUTOFDATE?v=latest
+                        ... Use "/* verilator lint_off PROFOUTOFDATE */" and lint_on around source to disable this message.
+%Error: Exiting due to
diff --git a/test_regress/t/t_pgo_profoutofdate_bad.pl b/test_regress/t/t_pgo_profoutofdate_bad.pl
new file mode 100755
index 000000000..e2cfc96a1
--- /dev/null
+++ b/test_regress/t/t_pgo_profoutofdate_bad.pl
@@ -0,0 +1,20 @@
+#!/usr/bin/env perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+scenarios(vltmt => 1);
+
+compile(
+    v_flags2 => ["--threads 2"],
+    fails => 1,
+    expect_filename => $Self->{golden_filename},
+    );
+
+ok(1);
+1;
diff --git a/test_regress/t/t_pgo_profoutofdate_bad.v b/test_regress/t/t_pgo_profoutofdate_bad.v
new file mode 100755
index 000000000..cba43da97
--- /dev/null
+++ b/test_regress/t/t_pgo_profoutofdate_bad.v
@@ -0,0 +1,28 @@
+// DESCRIPTION: Verilator: Verilog Test module
+//
+// This file ONLY is placed under the Creative Commons Public Domain, for
+// any use, without warranty, 2021 by Wilson Snyder.
+// SPDX-License-Identifier: CC0-1.0
+
+module t(/*AUTOARG*/
+   // Inputs
+   clk
+   );
+   input clk;
+
+   integer cyc=0;
+
+   // Test loop
+   always @ (posedge clk) begin
+      cyc <= cyc + 1;
+      if (cyc == 99) begin
+         $write("*-* All Finished *-*\n");
+         $finish;
+      end
+   end
+
+endmodule
+
+`verilator_config
+profile_data -model "x" -mtask "h7baded98__0" -cost 64'd12345678901234567890
+profile_data -model "x" -mtask "hb56134bd__0" -cost 945
diff --git a/test_regress/t/t_pgo_threads.pl b/test_regress/t/t_pgo_threads.pl
new file mode 100755
index 000000000..1bcccc905
--- /dev/null
+++ b/test_regress/t/t_pgo_threads.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/env perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003 by Wilson Snyder. This program is free software; you
+# can redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
+
+scenarios(vltmt => 1);
+
+# It doesn't really matter what test
+top_filename("t/t_gen_alw.v");
+
+compile(
+    v_flags2 => ["--prof-threads --threads 2"]
+    );
+
+execute(
+    all_run_flags => ["+verilator+prof+threads+start+0",
+                      " +verilator+prof+threads+file+/dev/null",
+                      " +verilator+prof+vlt+file+$Self->{obj_dir}/profile.vlt",
+                      ],
+    check_finished => 1,
+    );
+
+file_grep("$Self->{obj_dir}/profile.vlt", qr/profile_data/i);
+
+compile(
+    # Intentionally no --prof-threads here, so we make sure profile data
+    # can be read in without it (that is, no --prof-threads effect on profile_data hash names)
+    v_flags2 => ["--threads 2",
+                 " $Self->{obj_dir}/profile.vlt"],
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+ok(1);
+1;
diff --git a/test_regress/t/t_verilated_all.pl b/test_regress/t/t_verilated_all.pl
index cc0caa085..14ef03202 100755
--- a/test_regress/t/t_verilated_all.pl
+++ b/test_regress/t/t_verilated_all.pl
@@ -21,10 +21,15 @@ compile(
                           ? "--threads 2 $root/include/verilated_threads.cpp" : ""),
                          ($Self->cfg_with_threaded
                           ? "--trace-threads 1" : ""),
+                         ($Self->cfg_with_threaded
+                          ? "--prof-threads" : ""),
                          "$root/include/verilated_save.cpp"],
     );
 
 execute(
+    all_run_flags => [" +verilator+prof+threads+file+/dev/null",
+                      " +verilator+prof+vlt+file+/dev/null",
+                      ],
     check_finished => 1,
     );
 
diff --git a/test_regress/t/t_verilated_all_newest.pl b/test_regress/t/t_verilated_all_newest.pl
index 1df755ae6..08b1f3838 100755
--- a/test_regress/t/t_verilated_all_newest.pl
+++ b/test_regress/t/t_verilated_all_newest.pl
@@ -21,6 +21,9 @@ compile(
     );
 
 execute(
+    all_run_flags => [" +verilator+prof+threads+file+/dev/null",
+                      " +verilator+prof+vlt+file+/dev/null",
+                      ],
     check_finished => 1,
     );