diff --git a/Changes b/Changes
index 14b034de7..72f687a4d 100644
--- a/Changes
+++ b/Changes
@@ -5,15 +5,18 @@ The contributors that suggested a given feature are shown in []. Thanks!
 * Verilator 4.000 devel
 
 **    This is a major release.  Any patches may require major rework to apply.
+      [Thanks everyone]
+
+**    Add multithreaded model generation.
 
 **    Add runtime arguments.
 
-**    Fix internals to be C++ null-pointer-check clean.
-
 ***   Better optimize large always block splitting, bug1244. [John Coiner]
 
 ***   Add new reloop optimization for repetitive assignment compression.
 
+****  Fix internals to be C++ null-pointer-check clean.
+
 ****  Fix internals to avoid 'using namespace std'.
 
 ****  Fix Verilation performance issues, bug1316. [John Coiner]
diff --git a/Makefile.in b/Makefile.in
index c864db313..c087a4736 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -120,6 +120,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \
 	bin/verilator \
 	bin/verilator_coverage \
 	bin/verilator_difftree \
+	bin/verilator_gantt \
 	bin/verilator_includer \
 	bin/verilator_profcfunc \
 	doxygen-mainpage doxygen.config veripool-logo.png \
@@ -154,6 +155,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \
 INST_PROJ_FILES = \
 	bin/verilator \
 	bin/verilator_coverage \
+	bin/verilator_gantt \
 	bin/verilator_includer \
 	bin/verilator_profcfunc \
 	include/verilated.mk \
@@ -272,12 +274,12 @@ internals.pdf: internals.pod Makefile
 
 # See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
 VL_INST_BIN_FILES = verilator verilator_bin verilator_bin_dbg verilator_coverage_bin_dbg \
-	verilator_coverage verilator_includer verilator_profcfunc
+	verilator_coverage verilator_gantt verilator_includer verilator_profcfunc
 # Some scripts go into both the search path and pkgdatadir,
 # so they can be found by the user, and under $VERILATOR_ROOT.
 
 # See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
-VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_profcfunc.1
+VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_gantt.1 verilator_profcfunc.1
 
 VL_INST_INC_BLDDIR_FILES = \
 	include/verilated_config.h \
@@ -295,6 +297,7 @@ installbin:
 	$(SHELL) ${srcdir}/mkinstalldirs $(DESTDIR)$(bindir)
 	( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator $(DESTDIR)$(bindir)/verilator )
 	( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_coverage $(DESTDIR)$(bindir)/verilator_coverage )
+	( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_gantt $(DESTDIR)$(bindir)/verilator_gantt )
 	( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_profcfunc $(DESTDIR)$(bindir)/verilator_profcfunc )
 	( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin $(DESTDIR)$(bindir)/verilator_bin )
 	( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin_dbg $(DESTDIR)$(bindir)/verilator_bin_dbg )
diff --git a/bin/verilator b/bin/verilator
index 4f6516c2d..2a6a45cc2 100755
--- a/bin/verilator
+++ b/bin/verilator
@@ -338,6 +338,7 @@ detailed descriptions in L</"VERILATION ARGUMENTS"> for more information.
     --pipe-filter <command>     Filter all input through a script
     --prefix <topname>          Name of top level class
     --prof-cfuncs               Name functions for profiling
+    --prof-threads              Enable generating gantt chart data for threads
     --private                   Debugging; see docs
     --public                    Debugging; see docs
      -pvalue+<name>=<value>     Overwrite toplevel parameter
@@ -350,6 +351,9 @@ detailed descriptions in L</"VERILATION ARGUMENTS"> for more information.
     --stats-vars                Provide statistics on variables
      -sv                        Enable SystemVerilog parsing
      +systemverilogext+<ext>    Synonym for +1800-2017ext+<ext>
+    --threads <threads>         Enable multithreading
+    --threads-dpi <mode>        Enable multithreaded DPI
+    --threads-max-mtasks <mtasks>  Tune maximum mtask partitioning
     --top-module <topname>      Name of top level input module
     --trace                     Enable waveform creation
     --trace-depth <levels>      Depth of tracing
@@ -386,6 +390,9 @@ detailed descriptions in L</"RUNTIME ARGUMENTS"> for more information.
      +verilator+debug                  Enable debugging
      +verilator+debugi+<value>         Enable debugging at a level
      +verilator+help                   Display help
+     +verilator+prof+threads+file+<filename>  Set profile filename
+     +verilator+prof+threads+start+<value>    Set profile starting point
+     +verilator+prof+threads+window+<value>   Set profile duration
      +verilator+rand+reset+<value>     Set random reset technique
      +verilator+V                      Verbose version and config
      +verilator+version                Show version and exit
@@ -1080,6 +1087,18 @@ Verilog module and line number the statement came from.  This allows gprof
 or oprofile reports to be correlated with the original Verilog source
 statements. See also L<verilator_profcfunc>.
 
+=item --prof-threads
+
+Enable gantt chart data collection for threaded builds.
+
+Verilator will record the start and end time of each macro-task across a
+number of calls to eval. (What is a macro-task? See the Verilator internals
+document.)
+
+When profiling is enabled, the runtime will emit a blurb of profiling data
+in non-human-friendly form. The C<verilator_gantt> script will transform
+this into a nicer visual format and produce some related statistics.
+
 =item --private
 
 Opposite of --public.  Is the default; this option exists for backwards
@@ -1134,7 +1153,10 @@ Enable including save and restore functions in the generated model.
 
 The user code must create a VerilatedSerialize or VerilatedDeserialze
 object then calling the << or >> operators on the generated model and any
-other data the process needs saved/restored.  For example:
+other data the process needs saved/restored.  These functions are not
+thread safe, and are typically called only by a main thread.
+
+For example:
 
     void save_model(const char* filenamep) {
         VerilatedSave os;
@@ -1173,6 +1195,42 @@ compatibility with other simulators.
 
 A synonym for C<+1800-2017ext+>I<ext>.
 
+=item --threads I<threads>
+
+=item --no-threads
+
+With --threads 0 or --no-threads, the default, the generated model is not
+thread safe. With --threads 1, the generated model is single threaded but
+may run in a multithreaded environment. With --threads N, where N >= 2, the
+model is generated to run multithreaded on up to N threads. See
+L</"MULTITHREADING">.
+
+=item --threads-dpi all
+
+=item --threads-dpi none
+
+=item --threads-dpi pure
+
+When using --dpi with --threads, control what DPI tasks are thread safe.
+
+With --threads-dpi all, enable Verilator to assume all DPI imports are
+threadsafe, and to use thread-local storage for communication with DPI,
+potentially improving performance. Any DPI libraries need appropriate
+mutexes to avoid undefined behavior.
+
+With --threads-dpi none, Verilator assumes DPI imports are not thread safe,
+and Verilator will serialize calls to DPI imports by default, potentially
+harming performance.
+
+With --threads-dpi pure, the default, Verilator assumes DPI pure imports
+are threadsafe, but non-pure DPI imports are not.
+
+=item --threads-max-mtasks I<value>
+
+Rarely needed.  When using --threads, specify the number of mtasks the
+model is to be partitioned into. If unspecified, Verilator approximates a
+good value.
+
 =item --top-module I<topname>
 
 When the input Verilog contains more than one top level module, specifies
@@ -1464,6 +1522,28 @@ Enable debugging at the provided level.
 
 Display help and exit.
 
+=item +verilator+prof+threads+file+I<filename>
+
+When using --prof-threads, the filename to dump to.  Defaults to
+"profile_threads.dat".
+
+=item +verilator+prof+threads+start+I<value>
+
+When using --prof-threads, Verilator will wait until $time is at this
+value, then start the profiling warmup, then capturing. Generally this
+should be set to some time that is well within the normal operation of the
+simulation, i.e. outside of reset. If 0, the dump is disabled. Defaults to
+1.
+
+=item +verilator+prof+threads+window+I<value>
+
+When using --prof-threads, after $time reaches
++verilator+prof+threads+start, Verilator will warm up the profiling for
+this number of eval() calls, then will capture the profiling of this number
+of eval() calls.  Defaults to 2, which makes sense for a
+single-clock-domain module where it's typical to want to capture one
+posedge eval() and one negedge eval().
+
 =item +verilator+rand+reset+I<value>
 
 When a model was Verilated using "-x-inital unique", sets the
@@ -1635,6 +1715,9 @@ compile times, and --x-assign=fast --x-initial=fast may increase the risk
 of reset bugs in trade for performance; see the above documentation for
 these flags.
 
+If using Verilated multithreaded, use C<numactl> to ensure you are using
+non-conflicting hardware resources. See L</"MULTITHREADING">.
+
 Minor Verilog code changes can also give big wins.  You should not have any
 UNOPTFLAT warnings from Verilator.  Fixing these warnings can result in
 huge improvements; one user fixed their one UNOPTFLAT warning by making a
@@ -2176,6 +2259,89 @@ the names of the .cpp files to compile in from the make variables generated
 in obj_dir/Vour_classes.mk.
 
 
+=head1 MULTITHREADING
+
+Verilator experimentally supports multithreading.
+
+With --no-threads, the default, the model is not thread safe, and any use
+of more than one thread calling into one or even different Verilated models
+may result in unpredictable behavior. This gives the highest single thread
+performance.
+
+With --threads 1, the generated model is single threaded, however the
+support libraries are multithread safe. This allows different
+instantiations of model(s) to potentially each be run under a different
+thread.  All threading is the responsibility of the user's C++ testbench.
+
+With --threads N, where N is at least 2, the generated model will be
+designed to run in parallel on N threads. The thread calling eval()
+provides one of those threads, and the generated model will create and
+manage the other N-1 threads. It's the client's responsibility not to
+oversubscribe the available CPU cores. Under CPU oversubscription, the
+Verilated model should not livelock nor deadlock, however, you can expect
+performance to be far worse than it would be with proper stoichiometry of
+threads and CPU cores.
+
+The remainder of this section describes behavior with --threads 1 or
+--threads N (not --no-threads).
+
+VL_THREADED is defined when compiling a threaded Verilated module, causing
+the Verilated support classes to become threadsafe.
+
+The thread used for constructing a model must be the same thread that
+calls eval() into the model; this is called the "eval thread". Certain
+global operations such as saving and tracing must be performed by a
+"main thread". In most cases the eval thread and main thread are the same
+thread (i.e. the user's top C++ testbench runs on a single thread), but
+this is not required.
+
+When running a multithreaded model, the default Linux task scheduler often
+works against the model, by assuming threads are short lived, and thus
+often schedules threads using multiple hyperthreads within the same
+physical core. For best performance use the C<numactl> program to (when the
+threading count fits) select unique physical cores on the same socket. For
+example, if a model was Verilated with "--threads 4", we consult
+
+   egrep 'processor|physical id|core id' /proc/cpuinfo
+
+To select cores 0, 1, 2, and 3 that are all located on the same socket (0)
+but different physical cores.  (Also useful is "numactl --hardware", or
+C<lscpu> but those don't show Hyperthreading cores.) Then we execute
+
+   numactl -m 0 -C 0,1,2,3 -- verilated_executable_name
+
+This will limit memory to socket 0, and threads to cores 0, 1, 2, 3,
+(presumably on socket 0) optimizing performance.  Of course this must be
+adjusted if you want another simulator using e.g. socket 1, or if you
+Verilated with a different number of threads.  To see what CPUs are
+actually used, use --prof-threads.
+
+=head2 Multithreaded Verilog and Library Support
+
+$display/$stop/$finish are delayed until the end of an eval() call in order
+to maintain ordering between threads. This may result in additional tasks
+completing after the $stop or $finish.
+
+If using --coverage, the coverage routines are fully thread safe.
+
+If using --dpi, Verilator assumes pure DPI imports are thread safe,
+balancing performance versus safety. See --threads-dpi.
+
+If using --savable, the save/restore classes are not multithreaded and
+must be called only by the eval thread.
+
+If using --sc, the SystemC kernel is not thread safe, therefore the eval
+thread and main thread must be the same.
+
+If using --trace, the tracing classes must be constructed and called from
+the main thread.
+
+If using --vpi, since SystemVerilog VPI was not architected by IEEE to be
+multithreaded, Verilator requires all VPI calls are only made from the main
+thread.
+
 =head1 CONFIGURATION FILES
 
 In addition to the command line, warnings and other features may be
@@ -3636,6 +3802,21 @@ section for more details.
 Ignoring this warning will only slow simulations, it will simulate
 correctly.
 
+=item UNOPTTHREADS
+
+Warns that the thread scheduler was unable to partition the design to fill
+the requested number of threads.
+
+One workaround is to request fewer threads with C<--threads>.
+
+Another possible workaround is to allow more MTasks in the runtime, by
+increasing the value of --threads-max-mtasks. More MTasks will result in
+more communication and synchronization overhead at runtime; the scheduler
+attempts to minimize the number of MTasks for this reason.
+
+Ignoring this warning will only slow simulations, it will simulate
+correctly.
+
 =item UNPACKED
 
 Warns that unpacked structs and unions are not supported.
@@ -4185,6 +4366,8 @@ performance gain.
 
 In 2009, major SystemVerilog and DPI language support was added.
 
+In 2018, Verilator 4.000 was released with multithreaded support.
+
 Currently, various language features and performance enhancements are added
 as the need arises.  Verilator is now about 3x faster than in 2002, and is
 faster than many popular commercial simulators.
@@ -4282,7 +4465,7 @@ License Version 2.0.
 
 =head1 SEE ALSO
 
-L<verilator_coverage>, L<verilator_profcfunc>, L<make>,
+L<verilator_coverage>, L<verilator_gantt>, L<verilator_profcfunc>, L<make>,
 
 L<verilator --help> which is the source for this document,
 
diff --git a/bin/verilator_gantt b/bin/verilator_gantt
new file mode 100755
index 000000000..cf4bf6c65
--- /dev/null
+++ b/bin/verilator_gantt
@@ -0,0 +1,559 @@
+: # -*-Mode: perl;-*- use perl, wherever it is
+eval 'exec perl -wS $0 ${1+"$@"}'
+  if 0;
+# See copyright, etc in below POD section.
+######################################################################
+
+use strict;
+use warnings;
+use Getopt::Long;
+use IO::File;
+use Pod::Usage;
+use vars qw ($Debug);
+
+# Script-level state and command line handling.
+$Debug = 0;
+# Input data filename; set by parameter() from the first non-option argument
+my $Opt_File;
+my $Opt_Time_Per_Char = 0;  # rdtsc ticks per char in gantt chart, 0=auto
+# Output VCD filename; undef disables the VCD dump (see --no-vcd)
+my $opt_vcd = "profile_threads.vcd";
+
+# Data parsed by read_data():
+#   $Threads{<thread>}{<start>} = {mtask=>, end=>, cpu=>}
+#   $Mtasks{<mtask>} = {elapsed=>, predict=>, end=>}
+#   $Global{args}, $Global{stats}, $Global{rdtsc_cycle_time}; report() adds
+#   $Global{cpus} and $Global{last_end}
+our %Threads;
+our %Mtasks;
+our %Global;
+
+autoflush STDOUT 1;
+autoflush STDERR 1;
+# Require options to be spelled out in full
+Getopt::Long::config ("no_auto_abbrev");
+if (! GetOptions (
+          "help"        => \&usage,
+          "scale=i"     => \$Opt_Time_Per_Char,
+          "debug"       => sub { $Debug = 1; },
+          "vcd=s"       => \$opt_vcd,
+          "no-vcd!"     => sub { $opt_vcd = undef; },
+          "<>"          => \&parameter,
+    )) {
+    die "%Error: Bad usage, try 'verilator_gantt --help'\n";
+}
+
+# Fall back to the default data filename when none was given
+$Opt_File = "profile_threads.dat" if !defined $Opt_File;
+
+# Read and report, then optionally write the VCD
+process($Opt_File);
+write_vcd($opt_vcd) if defined $opt_vcd;
+exit(0);
+
+#######################################################################
+
+# Show the embedded POD manual on standard output and terminate.
+# Installed as the --help callback; any callback arguments are ignored.
+sub usage {
+    pod2usage(-output  => \*STDOUT,
+              -exitval => 2,
+              -verbose => 2);
+    exit (1);
+}
+
+# Getopt::Long "<>" callback for non-option arguments.
+# The first such argument becomes the input filename; any further one is
+# a fatal error.
+sub parameter {
+    my ($param) = @_;
+    die "%Error: Unknown parameter: $param\n" if defined $Opt_File;
+    $Opt_File = $param;
+}
+
+#######################################################################
+
+# Top-level flow: load the profiling data from the given file, then print
+# the report.
+sub process {
+    my ($filename) = @_;
+
+    read_data($filename);
+    report();
+}
+
+#######################################################################
+
+# Parse the profiling dump in FILENAME into the global data structures.
+#
+# Fills in:
+#   $Threads{<thread>}{<start>} = {mtask=>, end=>, cpu=>}  one per mtask run
+#   $Mtasks{<mtask>} = {elapsed=> (summed over runs), predict=>, end=> (latest)}
+#   $Global{args}{<name>}  = numeric value of each "VLPROF arg" line
+#   $Global{stats}{<name>} = numeric value of each "VLPROF stat" line
+#   $Global{rdtsc_cycle_time} = value from a "rdtsc time = N ticks" line, else 0
+# %Global is reset on entry; %Threads and %Mtasks are only added to.
+# Dies if the file cannot be opened.  Unrecognized lines are ignored, or
+# echoed when $Debug is set.
+sub read_data {
+    my $filename = shift;
+
+    %Global = (rdtsc_cycle_time => 0);
+
+    my $fh = IO::File->new ($filename) or die "%Error: $! $filename,";
+    while (my $line = $fh->getline) {
+        if ($line =~ m/VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)/) {
+            my $mtask = $1;
+            my $start = $2;
+            my $end = $3;
+            my $elapsed_time = $4;
+            my $predict_time = $5;
+            my $cpu = $6;
+            my $thread = $7;
+            $Threads{$thread}{$start}{mtask} = $mtask;
+            $Threads{$thread}{$start}{end} = $end;
+            $Threads{$thread}{$start}{cpu} = $cpu;
+
+            if (!exists $Mtasks{$mtask}{elapsed}) {
+                $Mtasks{$mtask}{elapsed} = 0;
+            }
+            $Mtasks{$mtask}{elapsed} += $elapsed_time;
+            $Mtasks{$mtask}{predict} = $predict_time;
+            $Mtasks{$mtask}{end} = max($Mtasks{$mtask}{end}, $end);
+        }
+        elsif ($line =~ /^VLPROFTHREAD/) {}
+        # Argument value is either joined with '+' (runtime +verilator+ args)
+        # or separated by whitespace.  The value may be several characters
+        # long, so the capture must be one-or-more, not a single character.
+        elsif ($line =~ m/VLPROF arg\s+(\S+)\+([0-9.]+)\s*$/
+               || $line =~ m/VLPROF arg\s+(\S+)\s+([0-9.]+)\s*$/) {
+            $Global{args}{$1} = $2;
+        }
+        elsif ($line =~ m/VLPROF stat\s+(\S+)\s+([0-9.]+)/) {
+            $Global{stats}{$1} = $2;
+        }
+        elsif ($line =~ /^#/) {}
+        elsif ($Debug) {
+            chomp $line;
+            print "Unk: $line\n";
+        }
+        # TODO -- this is parsing text printed by a client.
+        # Really, verilator proper should generate this
+        # if it's useful...
+        if ($line =~ m/rdtsc time = (\d+) ticks/) {
+            $Global{rdtsc_cycle_time} = $1;
+        }
+    }
+    $fh->close;
+}
+
+# Print the textual report on standard output: the recorded argument
+# settings, the gantt graph (via report_graph), utilization analysis, and
+# statistics on the log of predicted-to-elapsed time per mtask.
+#
+# Reads %Threads, %Mtasks and %Global as filled in by read_data().
+# Side effects: sets $Global{cpus}{cpu_time}{<cpu>} and $Global{last_end},
+# and bumps a zero $Mtasks{<mtask>}{predict} to 1 so its log can be taken.
+sub report {
+    print "Verilator Gantt report\n";
+
+    print "\nArgument settings:\n";
+    foreach my $arg (sort keys %{$Global{args}}) {
+        my $plus = ($arg =~ /^\+/) ? "+" : " ";
+        printf "  %s%s%d\n", $arg, $plus, $Global{args}{$arg};
+    }
+
+    my $nthreads = scalar keys %Threads;
+    # Accumulate the busy time of every cpu that ran at least one mtask
+    $Global{cpus}{cpu_time} = {};
+    foreach my $thread (keys %Threads) {
+        foreach my $start (keys %{$Threads{$thread}}) {
+            my $cpu = $Threads{$thread}{$start}{cpu};
+            my $elapsed = $Threads{$thread}{$start}{end} - $start;
+            $Global{cpus}{cpu_time}{$cpu} += $elapsed;
+        }
+    }
+
+    my $mt_mtask_time = 0;
+    my $long_mtask_time = 0;
+    my $last_end = 0;
+    foreach my $mtask (keys %Mtasks) {
+        $mt_mtask_time += $Mtasks{$mtask}{elapsed};
+        $last_end = max($last_end, $Mtasks{$mtask}{end});
+        $long_mtask_time = max($long_mtask_time, $Mtasks{$mtask}{elapsed});
+    }
+    $Global{last_end} = $last_end;
+
+    report_graph();
+
+    # If we know cycle time in the same (rdtsc) units,
+    # this will give us an actual utilization number,
+    # (how effectively we keep the cores busy.)
+    #
+    # It also gives us a number we can compare against
+    # serial mode, to estimate the overhead of data sharing,
+    # which will show up in the total elapsed time. (Overhead
+    # of synchronization and scheduling should not.)
+    # The yields stat may be absent from the data; report it as zero
+    my $yields = defined $Global{stats}{yields} ? $Global{stats}{yields} : 0;
+    print "\nAnalysis:\n";
+    printf "  Total threads             = %d\n", $nthreads;
+    printf "  Total mtasks              = %d\n", scalar (keys %Mtasks);
+    # Count the cpus themselves, not the keys of the {cpus} container
+    printf "  Total cpus used           = %d\n", scalar (keys %{$Global{cpus}{cpu_time}});
+    printf "  Total yields              = %d\n", $yields;
+    printf "  Total eval time           = %d rdtsc ticks\n", $Global{last_end};
+    printf "  Longest mtask time        = %d rdtsc ticks\n", $long_mtask_time;
+    printf "  All-thread mtask time     = %d rdtsc ticks\n", $mt_mtask_time;
+    # Guard the divisions; with no recorded mtasks last_end is zero
+    my $long_efficiency = ($Global{last_end}
+                           ? $long_mtask_time/($Global{last_end}) : 0);
+    printf "  Longest-thread efficiency = %0.1f%%\n", $long_efficiency*100;
+    my $mt_efficiency = (($Global{last_end} && $nthreads)
+                         ? $mt_mtask_time/($Global{last_end}*$nthreads) : 0);
+    printf "  All-thread efficiency     = %0.1f%%\n", $mt_efficiency*100;
+    printf "  All-thread speedup        = %0.1f\n", $mt_efficiency*$nthreads;
+    if ($Global{rdtsc_cycle_time} > 0) {
+        my $ut = $mt_mtask_time / $Global{rdtsc_cycle_time};
+        print "tot_mtask_cpu=$mt_mtask_time cyc=$Global{rdtsc_cycle_time} ut=$ut\n";
+    }
+
+    # p2e = predicted time / elapsed time; collect log(p2e) per mtask and
+    # remember which mtasks have the extreme ratios
+    my @p2e_ratios;
+    my $min_p2e = 1000000;
+    my $min_mtask;
+    my $max_p2e = -1000000;
+    my $max_mtask;
+    foreach my $mtask (sort keys %Mtasks) {
+        if ($Mtasks{$mtask}{elapsed} > 0) {
+            if ($Mtasks{$mtask}{predict} == 0) {
+                $Mtasks{$mtask}{predict} = 1;  # don't log(0) below
+            }
+            my $p2e_ratio = log( $Mtasks{$mtask}{predict} / $Mtasks{$mtask}{elapsed} );
+            #print "log(p2e $mtask) = $p2e_ratio   (predict $Mtasks{$mtask}{predict}, elapsed $Mtasks{$mtask}{elapsed})\n";
+            push @p2e_ratios, $p2e_ratio;
+
+            if ($p2e_ratio > $max_p2e) {
+                $max_p2e = $p2e_ratio;
+                $max_mtask = $mtask;
+            }
+            if ($p2e_ratio < $min_p2e) {
+                $min_p2e = $p2e_ratio;
+                $min_mtask = $mtask;
+            }
+        }
+    }
+
+    print "\nStatistics:\n";
+    if (!@p2e_ratios) {
+        # Nothing to summarize; avoid indexing %Mtasks with an undefined key
+        print "  No mtasks with nonzero elapsed time; statistics unavailable\n";
+        print "\n";
+        return;
+    }
+    print "  min log(p2e) = $min_p2e  from mtask $min_mtask (predict $Mtasks{$min_mtask}{predict}, elapsed $Mtasks{$min_mtask}{elapsed})\n";
+    print "  max log(p2e) = $max_p2e  from mtask $max_mtask (predict $Mtasks{$max_mtask}{predict}, elapsed $Mtasks{$max_mtask}{elapsed})\n";
+
+    my $stddev = stddev(\@p2e_ratios);
+    my $mean = mean(\@p2e_ratios);
+    print "  mean = " . ($mean) . "\n";
+    print "  stddev = " . ($stddev) . "\n";
+    print "  e ^ stddev = " . exp($stddev). "\n";
+    print "\n";
+}
+
+# Print the per-thread gantt graph on standard output.
+#
+# The horizontal scale is $Opt_Time_Per_Char rdtsc ticks per character.
+# When that is 0 the scale is chosen automatically: start at 40 columns
+# for the whole capture and halve the scale until _make_graph() reports no
+# conflicts (or the scale gets to 10 or below), then halve once more so
+# more labels fit.  Requires $Global{last_end} to be set (see report()).
+sub report_graph {
+    my $time_per = $Opt_Time_Per_Char;
+    if ($time_per == 0) {
+        $time_per = ($Global{last_end} / 40);  # Start with 40 columns
+        while ($time_per > 10) {
+            my ($graph, $conflicts) = _make_graph($time_per);
+            last if !$conflicts;
+            $time_per = int($time_per/2);
+        }
+        # One more step so we can fit more labels
+        $time_per = int($time_per/2);
+    }
+    # A scale below one tick per character (short captures round down to
+    # zero above, or a non-positive --scale) would divide by zero below
+    $time_per = 1 if $time_per < 1;
+
+    my ($graph, $conflicts) = _make_graph($time_per);
+
+    print "\nThread gantt graph:\n";
+    print "  Legend: One character width = $time_per rdtsc ticks\n";
+    print "  Legend: '&' = multiple mtasks in this period (character width)\n";
+
+    # Pad the scale header with '-' out to the column of the last end time
+    my $scale = "   <-".$Global{last_end}." rdtsc total";
+    for (my $col = length($scale);  # -2 for '->' below
+         $col < ($Global{last_end}/$time_per); ++$col) {
+        $scale .= "-";
+    }
+    print "  $scale->\n";
+
+    foreach my $thread (sort keys %{$graph}) {
+        print "  t: ";
+        _print_graph_line($graph->{$thread}, '');
+    }
+}
+
+# Build the character graph for every thread at a scale of $time_per
+# ticks per column.
+#
+# Returns ($graph, $conflicts), where $graph->{<thread>}[<column>]{char}
+# holds the single character to print in that column, and $conflicts is
+# the number of columns in which more than one mtask label began and the
+# labels could not be laid out without overlap.
+sub _make_graph {
+    my $time_per = shift;
+
+    my $graph = {};  # {thread}[column]{char=>'x' or chars=>#}
+    my $conflicts = 0;
+    foreach my $thread (keys %Threads) {
+        # Make potentially multiple characters per column
+        # Pass 1: append each mtask's whole label "[<cpu>---]" to the cell
+        # of its starting column, in increasing start time order
+        foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) {
+            my $end = $Threads{$thread}{$start}{end};
+            my $mtask = $Threads{$thread}{$start}{mtask};
+            my $cpu = $Threads{$thread}{$start}{cpu};
+
+            my $startcol = _time_col($time_per, $start);
+            my $endcol = _time_col($time_per, $end);
+
+            my $label = "[";
+            $label .= "$cpu";  # Maybe make optional in future
+            my $width = $endcol - $startcol + 1;
+            # Pad with '-' up to the scaled width; the label is never
+            # shortened, so it may be wider than the mtask's columns
+            while (length($label) < ($width-1)) {  # -1 for ']'
+                $label .= "-";
+            }
+            $label .= "]";
+            $graph->{$thread}[$startcol]{char} .= $label;
+        }
+        if ($Debug) {
+            print "# Multicol: "; _print_graph_line($graph->{$thread}, '|');
+        }
+        # Expand line to one char per column
+        # Pass 2: the array bound is re-read every iteration, as the
+        # element accesses below can extend the array
+        for (my $col = 0; $col <= $#{$graph->{$thread}}; ++$col) {
+            if (my $chars = $graph->{$thread}[$col]{char}) {
+                # The label fits only if every later column it would
+                # occupy is still empty
+                my $ok = 1;
+                for (my $coladd = 1; $coladd<length($chars); ++$coladd) {
+                    if ($graph->{$thread}[$col + $coladd]{char}) {
+                        $ok = 0; last;
+                    }
+                }
+                if (!$ok) {
+                    # No room: collapse to a single marker character
+                    if ($chars =~ /\[.*\[/) {  # Two begins or more
+                        $conflicts++;
+                        $graph->{$thread}[$col]{char} = "&";
+                    } else {
+                        $graph->{$thread}[$col]{char} = "[";
+                    }
+                    # Fill with 'x' up to the next occupied column
+                    for (my $coladd = 1; $coladd<length($chars); ++$coladd) {
+                        if ($graph->{$thread}[$col + $coladd]{char}) {
+                            last;
+                        } else {
+                            $graph->{$thread}[$col + $coladd]{char} = 'x';
+                        }
+                    }
+                } else {
+                    # Room available: spread the label one char per column
+                    my $coladd = 0;
+                    foreach my $char (split //, $chars) {
+                        $graph->{$thread}[$col+$coladd]{char} = $char;
+                        ++$coladd;
+                    }
+                }
+            }
+        }
+        if ($Debug) {
+            print "# Singlcol: "; _print_graph_line($graph->{$thread}, '|');
+        }
+    }
+    print "# Conflicts $conflicts\n" if $Debug;
+    return ($graph, $conflicts);
+}
+
+# Print one graph row: the {char} of every column in turn, each followed
+# by the separator string, then a newline.  Empty columns print as a space.
+sub _print_graph_line {
+    my ($cells, $separator) = @_;
+    foreach my $index (0 .. $#{$cells}) {
+        my $glyph = $cells->[$index]{char};
+        $glyph = ' ' unless defined $glyph;
+        print $glyph, $separator;
+    }
+    print "\n";
+}
+
+# Convert a time to a graph column number, given the number of time units
+# shown per column; the quotient is truncated to an integer.
+sub _time_col {
+    my ($time_per, $time) = @_;
+    return int($time / $time_per);
+}
+
+#######################################################################
+
+# Write the data in %Threads to FILENAME as a value change dump (VCD).
+#
+# Signals, grouped in scopes under "gantt":
+#   threads.thread<N>_mtask  mtask being run by thread N
+#   cpus.cpu<N>_thread       thread being run on cpu N
+#   mtasks.mtask<N>_cpu      cpu that mtask N is running on
+#   Stats.parallelism        number of mtasks running at this time
+# A signal is driven to Z while nothing is running on it.
+# Dies if the file cannot be created or fully written.
+sub write_vcd {
+    my $filename = shift;
+    print "Writing $filename\n";
+    # Pass the mode separately so that no character of $filename can be
+    # taken as part of the open mode
+    my $fh = IO::File->new($filename, "w") or die "%Error: $! $filename,";
+    my $vcd = {values => {},  # {<time>}{<code>} = value
+               sigs => {},  # {<module>}{<sig>} = code
+               code => 0,
+    };
+
+    # Return the code of signal $sig in scope $module, allocating the next
+    # free code on first use.  Tests existence rather than truth, as the
+    # first code allocated is 0.
+    my $code_for = sub {
+        my ($module, $sig) = @_;
+        if (!exists $vcd->{sigs}{$module}{$sig}) {
+            $vcd->{sigs}{$module}{$sig} = $vcd->{code}++;
+        }
+        return $vcd->{sigs}{$module}{$sig};
+    };
+
+    my %parallelism;  # {<time>} = change in number of running mtasks
+    foreach my $thread (keys %Threads) {
+        my $tcode = $code_for->("threads", "thread${thread}_mtask");
+        foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) {
+            my $end = $Threads{$thread}{$start}{end};
+            my $mtask = $Threads{$thread}{$start}{mtask};
+            my $cpu = $Threads{$thread}{$start}{cpu};
+            $vcd->{values}{$start}{$tcode} = $mtask;
+            $vcd->{values}{$end}{$tcode} = undef;
+            $parallelism{$start}++;
+            $parallelism{$end}--;
+
+            my $ccode = $code_for->("cpus", "cpu${cpu}_thread");
+            $vcd->{values}{$start}{$ccode} = $thread;
+            $vcd->{values}{$end}{$ccode} = undef;
+
+            my $mcode = $code_for->("mtasks", "mtask${mtask}_cpu");
+            $vcd->{values}{$start}{$mcode} = $cpu;
+            $vcd->{values}{$end}{$mcode} = undef;
+        }
+    }
+    {
+        # Running sum of the start/end deltas gives the active mtask count
+        my $pcode = $code_for->("Stats", "parallelism");
+        my $value = 0;
+        foreach my $time (sort {$a<=>$b} keys %parallelism) {
+            $value += $parallelism{$time};
+            $vcd->{values}{$time}{$pcode} = $value;
+        }
+    }
+
+    $fh->print('$version Generated by verilator_gantt $end'."\n");
+    $fh->print('$timescale 1ns $end'."\n");
+    $fh->print("\n");
+
+    my %all_codes;
+    $fh->print(' $scope module gantt $end'."\n");
+    foreach my $module (sort keys %{$vcd->{sigs}}) {
+        $fh->printf('  $scope module %s $end'."\n", $module);
+        foreach my $sig (sort keys %{$vcd->{sigs}{$module}}) {
+            my $code = $vcd->{sigs}{$module}{$sig};
+            $fh->printf('   $var wire 32 v%x %s [31:0] $end'."\n",
+                        $code, $sig);
+            $all_codes{$code} = 1;
+        }
+        $fh->print('  $upscope $end'."\n");
+    }
+    $fh->print(' $upscope $end'."\n");
+    $fh->print('$enddefinitions $end'."\n");
+    $fh->print("\n");
+
+    my $first = 1;
+    foreach my $time (sort {$a <=> $b} keys %{$vcd->{values}}) {
+        if ($first) {
+            $first = 0;
+            # Start with Z for any signals without time zero data
+            foreach my $code (keys %all_codes) {
+                if (!defined $vcd->{values}{$time}{$code}) {
+                    $vcd->{values}{$time}{$code} = undef;
+                }
+            }
+        }
+        $fh->printf("#%d\n", $time);
+        foreach my $code (sort keys %{$vcd->{values}{$time}}) {
+            my $value = $vcd->{values}{$time}{$code};
+            if (defined $value) {
+                $fh->printf("b%b v%x\n", $value, $code);
+            } else {
+                $fh->printf("bz v%x\n", $code);
+            }
+        }
+    }
+    # Buffered write errors only surface at close
+    $fh->close or die "%Error: $! $filename,";
+}
+
+#######################################################################
+# Similar to Statistics::Basic functions, but avoid a package dependency
+
+# Return the numerically largest defined argument, or undef when there is
+# none.  Undefined arguments are skipped wherever they appear, so the
+# running maximum may be passed in before it has been initialized.
+sub max {
+    my $n;
+    foreach my $v (@_) {
+        next if !defined $v;
+        $n = $v if !defined $n || $v > $n;
+    }
+    return $n;
+}
+
+# Return the arithmetic mean of the numbers in the referenced array, or
+# undef when the array is empty.
+sub mean {
+    my ($values) = @_;
+    my $count = scalar @$values;
+    return undef if $count == 0;
+    my $total = 0;
+    $total += $_ foreach @$values;
+    return $total/$count;
+}
+
+# Return the population standard deviation of the numbers in the
+# referenced array, or undef when the array is empty.
+sub stddev {
+    my $arrayref = shift;
+    my $n = 0;
+    my $sum = 0;
+    my $sumsq = 0;
+    foreach my $v (@$arrayref) {
+        $sum += $v;
+        $sumsq += $v**2;
+        $n++;
+    }
+    return undef if !$n;
+    # E[x^2] - E[x]^2 can round to slightly below zero when all values are
+    # (nearly) equal; sqrt of a negative number is fatal, so clamp it
+    my $variance = ($sumsq/$n) - ($sum/$n)**2;
+    $variance = 0 if $variance < 0;
+    return sqrt($variance);
+}
+
+#######################################################################
+__END__
+
+=pod
+
+=head1 NAME
+
+verilator_gantt - Create Gantt chart of multi-threaded execution
+
+=head1 SYNOPSIS
+
+Creates a visual representation to help analyze Verilator multithreaded
+simulation performance, by showing when each macro-task starts and ends,
+and showing when each thread is busy or idle.
+
+The generated Gantt chart has time on the X-axis. Times shown are to the
+scale printed, i.e. a certain amount of time for each character width.  The
+Y-axis shows threads, each thread's execution is shown on one line.  That
+line shows "[" at the position in time when it executes.
+
+Following the "[" is the cpu number the task executed on, followed by zero
+or more "-" to make the width of the characters match the scaled execution
+time, followed by a "]".  If the scale is too small, the cpu number and
+mtask number will not be printed.  If the scale is very small, a "&"
+indicates multiple mtasks started at that time position.
+
+Also creates a value change dump (VCD) format dump file which may be viewed
+in a waveform viewer (e.g. C<GTKWave>).  See below.
+
+=head1 USAGE
+
+  Build with --prof-threads.
+
+  Run a sim with +verilator+prof+threads+window+2.
+
+  This will create profile_threads.dat.
+
+  Then run:
+
+  verilator_gantt profile_threads.dat
+
+  The report will be printed on standard output, this also generates
+  profile_threads.vcd
+
+  View profile_threads.vcd in a waveform viewer.
+
+=head1 VCD SIGNALS
+
+In waveforms there are the following signals. For most signals, selecting
+the "decimal" format will remove the leading zeros and make the traces
+easier to read.
+
+parallelism: The number of mtasks active at this time, for best performance
+this will match the thread count. You may want to use an "analog step"
+format to view this signal.
+
+cpu#_thread: For the given CPU number, the thread number executing.
+
+mtask#_cpu: For the given mtask id, the CPU it is executing on.
+
+thread#_mtask: For the given thread number, the mtask id executing.
+
+=head1 ARGUMENTS
+
+=over 4
+
+=item I<filename>
+
+The filename to read data from, defaults to "profile_threads.dat".
+
+=item --help
+
+Displays this message and program version and exits.
+
+=item --scale I<n>
+
+On the X-axis of the generated Gantt chart, each character represents this
+many time units. (On x86, time units are rdtsc ticks.)  Defaults to 0,
+which will automatically compute a reasonable scale where no two mtasks
+need to fit into same character width's worth of scaled time.
+
+=item --no-vcd
+
+=item --vcd I<filename>
+
+Set output filename for vcd dump, or disable. Default is
+profile_threads.vcd.
+
+=back
+
+=head1 DISTRIBUTION
+
+The latest version is available from L<http://www.veripool.org/>.
+
+Copyright 2018-2018 by Wilson Snyder.  Verilator is free software; you can
+redistribute it and/or modify it under the terms of either the GNU Lesser
+General Public License Version 3 or the Perl Artistic License Version 2.0.
+
+=head1 AUTHORS
+
+Wilson Snyder <wsnyder@wsnyder.org>
+
+=head1 SEE ALSO
+
+C<verilator>
+
+=cut
+
+######################################################################
+### Local Variables:
+### compile-command: "$V4/bin/verilator_gantt $V4/test_regress/obj_vltmt/t_gantt/vlt_sim.log"
+### End:
diff --git a/include/verilated.cpp b/include/verilated.cpp
index ae4fb1aeb..7ba4571b3 100644
--- a/include/verilated.cpp
+++ b/include/verilated.cpp
@@ -38,6 +38,7 @@ VerilatedVoidCb Verilated::s_flushCb = NULL;
 
 // Keep below together in one cache line
 Verilated::Serialized Verilated::s_s;
+Verilated::NonSerialized Verilated::s_ns;
 VL_THREAD_LOCAL Verilated::ThreadLocal Verilated::t_s;
 
 Verilated::CommandArgValues Verilated::s_args;
@@ -196,6 +197,17 @@ Verilated::Serialized::Serialized() {
     s_fatalOnVpiError = true; // retains old default behaviour
 }
 
+Verilated::NonSerialized::NonSerialized() {
+    s_profThreadsStart = 1;
+    s_profThreadsWindow = 2;
+    s_profThreadsFilenamep = strdup("profile_threads.dat");
+}
+Verilated::NonSerialized::~NonSerialized() {
+    if (s_profThreadsFilenamep) {
+        free(const_cast<char*>(s_profThreadsFilenamep)); s_profThreadsFilenamep=NULL;
+    }
+}
+
 //===========================================================================
 // Random reset -- Only called at init time, so don't inline.
 
@@ -1648,6 +1660,20 @@ void Verilated::fatalOnVpiError(bool flag) VL_MT_SAFE {
     VerilatedLockGuard lock(m_mutex);
     s_s.s_fatalOnVpiError = flag;
 }
+void Verilated::profThreadsStart(vluint64_t flag) VL_MT_SAFE {
+    VerilatedLockGuard lock(m_mutex);
+    s_ns.s_profThreadsStart = flag;
+}
+void Verilated::profThreadsWindow(vluint64_t flag) VL_MT_SAFE {
+    VerilatedLockGuard lock(m_mutex);
+    s_ns.s_profThreadsWindow = flag;
+}
+void Verilated::profThreadsFilenamep(const char* flagp) VL_MT_SAFE {
+    VerilatedLockGuard lock(m_mutex);
+    char* const oldp = const_cast<char*>(s_ns.s_profThreadsFilenamep);  // flagp may alias this
+    s_ns.s_profThreadsFilenamep = strdup(flagp); free(oldp);  // So copy first; free(NULL) is ok
+}
+
 
 const char* Verilated::catName(const char* n1, const char* n2) VL_MT_SAFE {
     // Returns new'ed data
@@ -1800,6 +1826,15 @@ void VerilatedImp::commandArgVl(const std::string& arg) {
             VL_PRINTF_MT("For help, please see 'verilator --help'\n");
             VL_FATAL_MT("COMMAND_LINE", 0, "", "Exiting due to command line argument (not an error)");
         }
+        else if (commandArgVlValue(arg, "+verilator+prof+threads+start+", value/*ref*/)) {
+            Verilated::profThreadsStart(atoll(value.c_str()));
+        }
+        else if (commandArgVlValue(arg, "+verilator+prof+threads+window+", value/*ref*/)) {
+            Verilated::profThreadsWindow(atol(value.c_str()));
+        }
+        else if (commandArgVlValue(arg, "+verilator+prof+threads+file+", value/*ref*/)) {
+            Verilated::profThreadsFilenamep(value.c_str());
+        }
         else if (commandArgVlValue(arg, "+verilator+rand+reset+", value/*ref*/)) {
             Verilated::randReset(atoi(value.c_str()));
         }
diff --git a/include/verilated.h b/include/verilated.h
index a5340cc4f..531ccf2e0 100644
--- a/include/verilated.h
+++ b/include/verilated.h
@@ -344,6 +344,17 @@ class Verilated {
         ~Serialized() {}
     } s_s;
 
+    static struct NonSerialized {  // Non-serialized information
+        // These are reloaded from command-line settings, so do not need to persist
+        // Fast path
+        vluint64_t s_profThreadsStart;  ///< +prof+threads starting time
+        vluint32_t s_profThreadsWindow;  ///< +prof+threads window size
+        // Slow path
+        const char* s_profThreadsFilenamep;  ///< +prof+threads filename
+        NonSerialized();
+        ~NonSerialized();
+    } s_ns;
+
     // no need to be save-restored (serialized) the
     // assumption is that the restore is allowed to pass different arguments
     static struct CommandArgValues {
@@ -409,6 +420,14 @@ public:
     /// Enable/disable vpi fatal
     static void fatalOnVpiError(bool flag) VL_MT_SAFE;
     static bool fatalOnVpiError() VL_MT_SAFE { return s_s.s_fatalOnVpiError; }
+    /// --prof-threads related settings
+    static void profThreadsStart(vluint64_t flag) VL_MT_SAFE;
+    static vluint64_t profThreadsStart() VL_MT_SAFE { return s_ns.s_profThreadsStart; }
+    static void profThreadsWindow(vluint64_t flag) VL_MT_SAFE;
+    static vluint32_t profThreadsWindow() VL_MT_SAFE { return s_ns.s_profThreadsWindow; }
+    static void profThreadsFilenamep(const char* flagp) VL_MT_SAFE;
+    static const char* profThreadsFilenamep() VL_MT_SAFE { return s_ns.s_profThreadsFilenamep; }
+
     /// Flush callback for VCD waves
     static void flushCb(VerilatedVoidCb cb) VL_MT_SAFE;
     static void flushCall() VL_MT_SAFE;
diff --git a/include/verilated_threads.cpp b/include/verilated_threads.cpp
new file mode 100644
index 000000000..fccb4dffd
--- /dev/null
+++ b/include/verilated_threads.cpp
@@ -0,0 +1,229 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//=============================================================================
+//
+// THIS MODULE IS PUBLICLY LICENSED
+//
+// Copyright 2012-2018 by Wilson Snyder.  This program is free software;
+// you can redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
+//
+// This is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+//
+//=============================================================================
+///
+/// \file
+/// \brief Thread pool for verilated modules
+///
+//=============================================================================
+
+#include "verilatedos.h"
+#include "verilated_threads.h"
+#include <cstdio>
+
+std::atomic<vluint64_t> VlNotification::s_yields;
+
+VL_THREAD_LOCAL VlThreadPool::ProfileTrace* VlThreadPool::t_profilep = NULL;
+
+//=============================================================================
+// VlMTaskVertex
+
+VlMTaskVertex::VlMTaskVertex(vluint32_t upstreamDepCount)
+    : m_upstreamDepsDone(0),
+      m_upstreamDepCount(upstreamDepCount) {
+    assert(atomic_is_lock_free(&m_upstreamDepsDone));
+}
+
+//=============================================================================
+// VlWorkerThread
+
+VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
+    : m_poolp(poolp)
+    , m_profiling(profiling)
+    , m_exiting(false)
+      // Must init this last -- after setting up fields that it might read:
+    , m_cthread(startWorker, this) {}
+
+VlWorkerThread::~VlWorkerThread() {
+    m_exiting.store(true, std::memory_order_release);
+    {
+        VerilatedLockGuard lk(m_mutex);
+        if (sleeping()) {
+            wakeUp();
+        }
+    }
+    // The thread should exit; join it.
+    m_cthread.join();
+}
+
+void VlWorkerThread::workerLoop() {
+    if (VL_UNLIKELY(m_profiling)) {
+        m_poolp->setupProfilingClientThread();
+    }
+
+    VlNotification alarm;
+    ExecRec work;
+    work.m_fnp = NULL;
+
+    while (1) {
+        bool sleep = false;
+        if (VL_UNLIKELY(!work.m_fnp)) {
+            // Look for work
+            VerilatedLockGuard lk(m_mutex);
+            if (VL_LIKELY(!m_ready.empty())) {
+                dequeWork(&work);
+            } else {
+                // No work available, prepare to sleep. Pass alarm/work
+                // into m_sleepAlarm so wakeUp will tell this function.
+                //
+                // Must modify m_sleepAlarm in the same critical section as
+                // the check for ready work, otherwise we could race with
+                // another thread enqueueing work and never be awoken.
+                m_sleepAlarm.first = &alarm;
+                m_sleepAlarm.second = &work;
+                sleep = true;
+            }
+        }
+
+        // Do this here, not above, to avoid a race with the destructor.
+        if (VL_UNLIKELY(m_exiting.load(std::memory_order_acquire)))
+            break;
+
+        if (VL_UNLIKELY(sleep)) {
+            alarm.waitForNotification();  // ZZZzzzzz
+            alarm.reset();
+        }
+        if (VL_LIKELY(work.m_fnp)) {
+            work.m_fnp(work.m_evenCycle, work.m_sym);
+            work.m_fnp = NULL;
+        }
+    }
+
+    if (VL_UNLIKELY(m_profiling)) {
+        m_poolp->tearDownProfilingClientThread();
+    }
+}
+
+void VlWorkerThread::startWorker(VlWorkerThread* workerp) {
+    workerp->workerLoop();
+}
+
+//=============================================================================
+// VlThreadPool
+
+VlThreadPool::VlThreadPool(int nThreads, bool profiling)
+    : m_profiling(profiling) {
+    // --threads N passes nThreads=N-1, as the "main" thread counts as 1
+    const unsigned cpus = std::thread::hardware_concurrency();  // 0 if unknown
+    if (cpus != 0 && cpus < static_cast<unsigned>(nThreads+1)) {
+        VL_PRINTF_MT("%%Warning: System has %u CPUs but model Verilated with"
+                     " --threads %d; may run slow.\n", cpus, nThreads+1);
+    }
+    // Create'em
+    for (int i=0; i<nThreads; ++i) {
+        m_workers.push_back(new VlWorkerThread(this, profiling));
+    }
+    // Set up a profile buffer for the current thread too -- on the
+    // assumption that it's the same thread that calls eval and may be
+    // donated to run mtasks during the eval.
+    if (VL_UNLIKELY(m_profiling)) {
+        setupProfilingClientThread();
+    }
+}
+
+VlThreadPool::~VlThreadPool() {
+    for (size_t i = 0; i < m_workers.size(); ++i) {
+        // Each ~VlWorkerThread will wait for its thread to exit.
+        delete m_workers[i];
+    }
+    if (VL_UNLIKELY(m_profiling)) {
+        tearDownProfilingClientThread();
+    }
+}
+
+void VlThreadPool::tearDownProfilingClientThread() {
+    assert(t_profilep);
+    delete t_profilep;
+    t_profilep = NULL;
+}
+
+void VlThreadPool::setupProfilingClientThread() {
+    assert(!t_profilep);
+    t_profilep = new ProfileTrace;
+    // Reserve some space in the thread-local profiling buffer;
+    // try not to malloc while collecting profiling.
+    t_profilep->reserve(4096);
+    {
+        VerilatedLockGuard lk(m_mutex);
+        m_allProfiles.insert(t_profilep);
+    }
+}
+
+void VlThreadPool::profileAppendAll(const VlProfileRec& rec) {
+    VerilatedLockGuard lk(m_mutex);
+    for (ProfileSet::iterator it = m_allProfiles.begin();
+         it != m_allProfiles.end(); ++it) {
+        // Every thread's profile trace gets a copy of rec.
+        (*it)->emplace_back(rec);
+    }
+}
+
+void VlThreadPool::profileDump(const char* filenamep, vluint64_t ticksElapsed) {
+    VerilatedLockGuard lk(m_mutex);
+    VL_DEBUG_IF(VL_DBG_MSGF("+prof+threads writing to '%s'\n", filenamep););
+
+    FILE* fp = fopen(filenamep, "w");
+    if (VL_UNLIKELY(!fp)) {
+        VL_FATAL_MT(filenamep, 0, "", "+prof+threads+file file not writable");
+        return;
+    }
+
+    // TODO Perhaps merge with verilated_coverage output format, so can
+    // have a common merging and reporting tool, etc.
+    fprintf(fp, "VLPROFTHREAD 1.0 # Verilator thread profile dump version 1.0\n");
+    fprintf(fp, "VLPROF arg --threads %" VL_PRI64 "u\n",
+            vluint64_t(m_workers.size()+1));
+    fprintf(fp, "VLPROF arg +verilator+prof+threads+start+%" VL_PRI64 "u\n",
+            Verilated::profThreadsStart());
+    fprintf(fp, "VLPROF arg +verilator+prof+threads+window+%u\n",
+            Verilated::profThreadsWindow());
+    fprintf(fp, "VLPROF stat yields %" VL_PRI64 "u\n",
+            VlNotification::yields());
+
+    vluint32_t thread_id = 0;
+    for (ProfileSet::iterator pit = m_allProfiles.begin();
+         pit != m_allProfiles.end(); ++pit) {
+        ++thread_id;
+
+        bool printing = false;  // False while in warmup phase
+        for (ProfileTrace::iterator eit = (*pit)->begin();
+             eit != (*pit)->end(); ++eit) {
+            switch (eit->m_type) {
+            case VlProfileRec::TYPE_BARRIER:
+                printing = true;
+                break;
+            case VlProfileRec::TYPE_MTASK_RUN:
+                if (!printing) break;
+                fprintf(fp, "VLPROF mtask %d"
+                        " start %" VL_PRI64"u end %" VL_PRI64"u elapsed %" VL_PRI64 "u"
+                        " predict_time %u cpu %u on thread %u\n",
+                        eit->m_mtaskId,
+                        eit->m_startTime,
+                        eit->m_endTime,
+                        (eit->m_endTime - eit->m_startTime),
+                        eit->m_predictTime,
+                        eit->m_cpu,
+                        thread_id);
+                break;
+            default: assert(false);
+                break;
+            }
+        }
+    }
+    fprintf(fp, "VLPROF stat ticks %" VL_PRI64 "u\n",
+            ticksElapsed);
+
+    fclose(fp);
+}
diff --git a/include/verilated_threads.h b/include/verilated_threads.h
new file mode 100644
index 000000000..f935fb784
--- /dev/null
+++ b/include/verilated_threads.h
@@ -0,0 +1,313 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//=============================================================================
+//
+// THIS MODULE IS PUBLICLY LICENSED
+//
+// Copyright 2012-2018 by Wilson Snyder.  This program is free software;
+// you can redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
+//
+// This is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+//
+//=============================================================================
+///
+/// \file
+/// \brief Thread pool and profiling for Verilated modules
+///
+//=============================================================================
+
+#ifndef _VERILATED_THREADS_H_
+#define _VERILATED_THREADS_H_
+
+#include "verilatedos.h"
+
+#include <atomic>
+#include <thread>
+#include <vector>
+#include <set>
+#include <sched.h>  // For sched_getcpu()
+
+#include "verilated.h"  // for VerilatedMutex and clang annotations
+
+// VlMTaskVertex and VlThreadPool will work with multiple symbol table types.
+// Since the type is opaque to VlMTaskVertex and VlThreadPool, represent it
+// as a void* here.
+typedef void* VlThrSymTab;
+
+class VlNotification {
+    // MEMBERS
+    std::atomic<bool> m_notified;  // Notification pending
+    static std::atomic<vluint64_t> s_yields;  // Statistics
+
+public:
+    // CONSTRUCTORS
+    VlNotification()
+        : m_notified(false) {
+        assert(atomic_is_lock_free(&m_notified));
+    }
+    ~VlNotification() {}
+
+    // METHODS
+    static vluint64_t yields() { return s_yields; }
+
+    // Block until notify() has occurred, then return.
+    // If notify() has already occurred, return immediately.
+    //
+    // This is logically const: the object will remain in notified state
+    // after waitForNotification() returns, so you could notify more than
+    // one thread of the same event.
+    inline void waitForNotification() {
+        unsigned ct = 0;
+        while (VL_UNLIKELY(!notified())) {
+            VL_CPU_RELAX();
+            ct++;
+            if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
+                ct = 0;
+                ++s_yields;  // Statistics
+                std::this_thread::yield();
+            }
+        }
+    }
+
+    // The 'inline' keyword here means nothing to the compiler, it's
+    // implicit on methods defined within the class body anyway.
+    //
+    // 'inline' is attached to this method, and others in this file,
+    // to remind humans that some routines in this file are called many
+    // times per cycle in threaded mode. Such routines should be
+    // inlinable; that's why they're declared in the .h and not the .cpp.
+    inline bool notified() {
+        return m_notified.load(std::memory_order_acquire);
+    }
+    // Set notified state. If state is already notified,
+    // it remains so.
+    inline void notify() {
+        m_notified.store(true, std::memory_order_release);
+    }
+    // Reset the state to un-notified state, which is also the
+    // state of a new Notification object.
+    inline void reset() {
+        m_notified.store(false, std::memory_order_relaxed);
+    }
+};
+
+typedef void (*VlExecFnp)(bool, VlThrSymTab);
+
+/// Track dependencies for a single MTask.
+class VlMTaskVertex {
+    // MEMBERS
+
+    // On even cycles, m_upstreamDepsDone increases as upstream
+    // dependencies complete. When it reaches m_upstreamDepCount,
+    // this MTaskVertex is ready.
+    //
+    // On odd cycles, m_upstreamDepsDone decreases as upstream
+    // dependencies complete, and when it reaches zero this MTaskVertex
+    // is ready.
+    //
+    // An atomic is smaller than a mutex, and lock-free.
+    //
+    // (Why does the size of this class matter? If an mtask has many
+    // downstream mtasks to notify, we hope these will pack into a
+    // small number of cache lines to reduce the cost of pointer chasing
+    // during done-notification. Nobody's quantified that cost though.
+    // If we were really serious about shrinking this class, we could
+    // use 16-bit types here...)
+    std::atomic<vluint32_t> m_upstreamDepsDone;
+    const vluint32_t m_upstreamDepCount;
+
+public:
+    // CONSTRUCTORS
+
+    // 'upstreamDepCount' is the number of upstream MTaskVertex's
+    // that must notify this MTaskVertex before it will become ready
+    // to run.
+    explicit VlMTaskVertex(vluint32_t upstreamDepCount);
+    ~VlMTaskVertex() {}
+
+    // Upstream mtasks must call this when they complete.
+    // Returns true when the current MTaskVertex becomes ready to execute,
+    // false while it's still waiting on more dependencies.
+    inline bool signalUpstreamDone(bool evenCycle) {
+        if (evenCycle) {
+            vluint32_t upstreamDepsDone
+                = 1 + m_upstreamDepsDone.fetch_add(1, std::memory_order_release);
+            assert(upstreamDepsDone <= m_upstreamDepCount);
+            return (upstreamDepsDone == m_upstreamDepCount);
+        } else {
+            vluint32_t upstreamDepsDone_prev
+                = m_upstreamDepsDone.fetch_sub(1, std::memory_order_release);
+            assert(upstreamDepsDone_prev > 0);
+            return (upstreamDepsDone_prev == 1);
+        }
+    }
+    inline bool areUpstreamDepsDone(bool evenCycle) const {
+        vluint32_t target = evenCycle ? m_upstreamDepCount : 0;
+        return m_upstreamDepsDone.load(std::memory_order_acquire) == target;
+    }
+    inline void waitUntilUpstreamDone(bool evenCycle) const {
+        while (VL_UNLIKELY(!areUpstreamDepsDone(evenCycle))) {
+            VL_CPU_RELAX();
+        }
+    }
+};
+
+// Profiling support
+class VlProfileRec {
+protected:
+    friend class VlThreadPool;
+    enum VlProfileE {
+        TYPE_MTASK_RUN,
+        TYPE_BARRIER
+    };
+    VlProfileE m_type;  // Record type
+    vluint32_t m_mtaskId;  // Mtask we're logging
+    vluint32_t m_predictTime;  // How long scheduler predicted would take
+    vluint64_t m_startTime;  // Tick at start of execution
+    vluint64_t m_endTime;  // Tick at end of execution
+    unsigned m_cpu;  // Execution CPU number (at start anyways)
+public:
+    class Barrier {};
+    VlProfileRec() {}
+    explicit VlProfileRec(Barrier) {
+        m_type = TYPE_BARRIER;
+        m_mtaskId = 0;
+        m_predictTime = 0;
+        m_startTime = 0;
+        m_cpu = sched_getcpu();
+    }
+    void startRecord(vluint64_t time, uint32_t mtask, uint32_t predict) {
+        m_type = VlProfileRec::TYPE_MTASK_RUN;
+        m_mtaskId = mtask;
+        m_predictTime = predict;
+        m_startTime = time;
+        m_cpu = sched_getcpu();
+    }
+    void endRecord(vluint64_t time) {
+        m_endTime = time;
+    }
+};
+
+class VlThreadPool;
+
+class VlWorkerThread {
+private:
+    // TYPES
+    struct ExecRec {
+        VlExecFnp m_fnp;  // Function to execute
+        VlThrSymTab m_sym;  // Symbol table to execute
+        bool m_evenCycle;  // Even/odd for flag alternation
+        ExecRec() : m_fnp(NULL), m_sym(NULL), m_evenCycle(false) {}
+        ExecRec(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym)
+            : m_fnp(fnp), m_sym(sym), m_evenCycle(evenCycle) {}
+    };
+
+    // MEMBERS
+    VerilatedMutex m_mutex;
+
+    // Why a vector? We expect the pending list to be very short, typically
+    // 0 or 1 or 2, so popping from the front shouldn't be
+    // expensive. Revisit if we ever have longer queues...
+    std::vector<ExecRec> m_ready VL_GUARDED_BY(m_mutex);
+
+    VlThreadPool* m_poolp;  // Our associated thread pool
+
+    // If values stored are non-NULL, the thread is asleep pending new
+    // work. If the thread is not asleep, both parts of m_sleepAlarm must
+    // be NULL.
+    std::pair<VlNotification*, ExecRec*> m_sleepAlarm VL_GUARDED_BY(m_mutex);
+
+    bool m_profiling;  // Is profiling enabled?
+    std::atomic<bool> m_exiting;  // Worker thread should exit
+    std::thread m_cthread;  // Underlying C++ thread record
+
+    VL_UNCOPYABLE(VlWorkerThread);
+
+public:
+    // CONSTRUCTORS
+    explicit VlWorkerThread(VlThreadPool* poolp, bool profiling);
+    ~VlWorkerThread();
+
+    // METHODS
+    inline void dequeWork(ExecRec* workp) VL_REQUIRES(m_mutex) {
+        // As noted above this is inefficient if our ready list is ever
+        // long (but it shouldn't be)
+        *workp = m_ready.front();
+        m_ready.erase(m_ready.begin());
+    }
+    inline void wakeUp() VL_REQUIRES(m_mutex) {
+        VlNotification* notifyp = m_sleepAlarm.first;
+        m_sleepAlarm.first = NULL;  // NULL+NULL means wake
+        m_sleepAlarm.second = NULL;
+        notifyp->notify();
+    }
+    inline bool sleeping() VL_REQUIRES(m_mutex) {
+        return (m_sleepAlarm.first != NULL);
+    }
+    inline void addTask(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym) {
+        VerilatedLockGuard lk(m_mutex);
+        m_ready.emplace_back(fnp, evenCycle, sym);
+        if (VL_LIKELY(sleeping())) {  // Generally queue is waiting for work
+            // Awaken thread
+            dequeWork(m_sleepAlarm.second);
+            wakeUp();
+        }
+    }
+    void workerLoop();
+    static void startWorker(VlWorkerThread* workerp);
+};
+
+class VlThreadPool {
+    // TYPES
+    typedef std::vector<VlProfileRec> ProfileTrace;
+    typedef std::set<ProfileTrace*> ProfileSet;
+
+    // MEMBERS
+    std::vector<VlWorkerThread*> m_workers;  // our workers
+    bool m_profiling;  // is profiling enabled?
+
+    // Support profiling -- we can append records of profiling events
+    // to this vector with very low overhead, and then dump them out
+    // later. This prevents the overhead of printf/malloc/IO from
+    // corrupting the profiling data. It's super cheap to append
+    // a VlProfileRec struct on the end of a pre-allocated vector;
+    // this is the only cost we pay in real-time during a profiling cycle.
+    static VL_THREAD_LOCAL ProfileTrace* t_profilep;
+    ProfileSet m_allProfiles VL_GUARDED_BY(m_mutex);
+    VerilatedMutex m_mutex;
+
+public:
+    // CONSTRUCTORS
+    // Construct a thread pool with 'nThreads' dedicated threads. The thread
+    // pool will create these threads and make them available to execute tasks
+    // via this->workerp(index)->addTask(...)
+    VlThreadPool(int nThreads, bool profiling);
+    ~VlThreadPool();
+
+    // METHODS
+    inline int numThreads() const {
+        return m_workers.size();
+    }
+    inline VlWorkerThread* workerp(int index) {
+        assert(index >= 0);
+        assert(index < m_workers.size());
+        return m_workers[index];
+    }
+    inline VlProfileRec* profileAppend() {
+        t_profilep->emplace_back();
+        return &(t_profilep->back());
+    }
+    void profileAppendAll(const VlProfileRec& rec);
+    void profileDump(const char* filenamep, vluint64_t ticksElapsed);
+    // In profiling mode, each executing thread must call
+    // this once to setup profiling state:
+    void setupProfilingClientThread();
+    void tearDownProfilingClientThread();
+private:
+    VL_UNCOPYABLE(VlThreadPool);
+};
+
+#endif
diff --git a/internals.pod b/internals.pod
index 527cb7bc5..811f15bb5 100644
--- a/internals.pod
+++ b/internals.pod
@@ -155,6 +155,221 @@ provided and documented in C<V3GraphAlg.cpp>.
 
 =back
 
+=head2 Multithreaded Mode
+
+In --threads mode, the frontend of the Verilator pipeline is the same as
+in serial mode, up until V3Order.
+
+V3Order builds a fine-grained, statement-level dependency graph that governs
+the ordering of code within a single eval() call. In serial mode, that
+dependency graph is used to order all statements into a total serial order.
+In parallel mode, the same dependency graph is the starting point for a
+partitioner (V3Partition).
+
+The partitioner's goal is to coarsen the fine-grained DAG into a coarser
+DAG, while maintaining as much available parallelism as possible. Often the
+partitioner can transform an input graph with millions of nodes into a
+coarsened execution graph with a few dozen nodes, while maintaining enough
+parallelism to take advantage of a modern multicore CPU. Runtime
+synchronization cost is not prohibitive with so few nodes.
+
+=head3 Partitioning
+
+Our partitioner is similar to the one Vivek Sarkar described in his 1989
+paper "Partitioning and Scheduling Parallel Programs for Multiprocessors".
+
+Let's define some terms:
+
+=over 4
+
+=item C<Par Factor>
+
+The available parallelism or "par-factor" of a DAG is the total cost to
+execute all nodes, divided by the cost to execute the longest critical path
+through the graph. This is the speedup you would get from running the graph
+in parallel, if given infinite CPU cores and if communication and
+synchronization costs were zero.
+
+=item C<Macro Task>
+
+When the partitioner coarsens the graph, it combines nodes together. Each
+fine-grained node represents an atomic "task"; combined nodes in the
+coarsened graph are "macro-tasks". This term comes from Sarkar. Each
+macro-task executes from start to end on one processor, without any
+synchronization to any other macro-task during its
+execution. (Synchronization only happens before the macro-task begins or
+after it ends.)
+
+=item C<Edge Contraction>
+
+Our partitioner, like Sarkar's, primarily relies on "edge contraction" to
+coarsen the graph. It starts with one macro-task per atomic task and
+iteratively combines pairs of edge-connected macro-tasks.
+
+=item C<Local Critical Path>
+
+Each node in the graph has a "local" critical path. That's the critical
+path from the start of the graph to the start of the node, plus the node's
+cost, plus the critical path from the end of the node to the end of the
+graph.
+
+=back
+
+Sarkar calls out an important trade-off: coarsening the graph reduces
+runtime synchronization overhead among the macro-tasks, but it tends to
+increase the critical path through the graph and thus reduces par-factor.
+
+Sarkar's partitioner, and ours, chooses pairs of macro-tasks to merge such
+that the growth in critical path is minimized. Each candidate merge would
+result in a new node, which would have some local critical path. We choose
+the candidate that would produce the shortest local critical path. Repeat
+until par-factor falls to a target threshold. It's a greedy algorithm, and
+it's not guaranteed to produce the best partition (which Sarkar proves is
+NP-hard).
+
+=head3 Estimating Logic Costs
+
+To compute the cost of any given path through the graph, Verilator
+estimates an execution cost for each task. Each macro-task has an execution
+cost which is simply the sum of its tasks' costs. We assume that
+communication overhead and synchronization overhead are zero, so the cost
+of any given path through the graph is simply the sum of macro-task
+execution costs. Sarkar does almost the same thing, except that he has
+nonzero estimates for synchronization costs.
+
+Verilator's cost estimates are assigned by the InstrCountCostVisitor.  This
+class is perhaps the most fragile piece of the multithread implementation.
+It's easy to have a bug where you count something cheap (eg. accessing one
+element of a huge array) as if it were expensive (eg. by counting it as if
+it were an access to the entire array.) Even without such gross bugs, the
+estimates this produces are only loosely predictive of actual runtime cost.
+Multithread performance would be better with better runtime cost
+estimates.  This is an area to improve.
+
+=head3 Scheduling Macro-Tasks at Runtime
+
+After coarsening the graph, we must schedule the macro-tasks for runtime.
+Sarkar describes two options: you can dynamically schedule tasks at
+runtime, with a runtime graph follower. Sarkar calls this the
+"macro-dataflow model."  Verilator does not support this; early experiments
+with this approach had poor performance.
+
+The other option is to statically assign macro-tasks to threads, with each
+thread running its macro-tasks in a static order. Sarkar describes this in
+Chapter 5. Verilator takes this static approach. The only dynamic aspect is
+that each macro task may block before starting, to wait until its
+prerequisites on other threads have finished.
+
+The synchronization cost is cheap if the prereqs are done. If they're not,
+fragmentation (idle CPU cores waiting) is possible. This is the major
+source of overhead in this approach. The --prof-threads switch and the
+C<verilator_gantt> script can visualize the time lost to such
+fragmentation.
+
+=head3 Locating Variables for Best Spatial Locality
+
+After scheduling all code, we attempt to locate variables in memory such
+that variables accessed by a single macro-task are close together in
+memory.  This provides "spatial locality" -- when we pull in a 64-byte
+cache line to access a 2-byte variable, we want the other 62 bytes to be
+ones we'll also likely access soon, for best cache performance.
+
+This turns out to be critical for performance. It should allow Verilator
+to scale to very large models. We don't rely on our working set fitting
+in any CPU cache; instead we essentially "stream" data into caches from
+memory. It's not literally streaming, where the address increases
+monotonically, but it should have similar performance characteristics,
+so long as each macro-task's dataset fits in one core's local caches.
+
+To achieve spatial locality, we tag each variable with the set of
+macro-tasks that access it. Let's call this set the "footprint" of that
+variable. The variables in a given module have a set of footprints. We can
+order those footprints to minimize the distance between them (distance is
+the number of macro-tasks that are different across any two footprints) and
+then emit all variables into the struct in ordered-footprint order.
+
+The footprint ordering is literally the traveling salesman problem, and we
+use a TSP-approximation algorithm to get close to an optimal sort.
+
+This is an old idea. Simulators designed at DEC in the early 1990s used
+similar techniques to optimize both single-thread and multi-thread modes.
+(Verilator does not optimize variable placement for spatial locality in
+serial mode; that is a possible area for improvement.)
+
+=head3 Improving Multithreaded Performance Further (a TODO list)
+
+=over 4
+
+=item C<Wave Scheduling>
+
+To allow the verilated model to run in parallel with the testbench, it
+might be nice to support "wave" scheduling, in which work on a cycle begins
+before eval() is called or continues after eval() returns.  For now all
+work on a cycle happens during the eval() call, leaving Verilator's threads
+idle while the testbench (everything outside eval()) is working. This would
+involve fundamental changes within the partitioner; however, it's probably
+the best bet for hiding testbench latency.
+
+=item C<Efficient Dynamic Scheduling>
+
+To scale to more than a few threads, we may revisit a fully dynamic
+scheduler. For large (>16 core) systems it might make sense to dedicate an
+entire core to scheduling, so that scheduler data structures would fit in
+its L1 cache and thus the cost of traversing priority-ordered ready lists
+would not be prohibitive.
+
+=item C<Static Scheduling with Runtime Repack>
+
+We could modify the static scheduling approach by gathering actual
+macro-task execution times at run time, and dynamically re-packing the
+macro-tasks into the threads also at run time. Say, re-pack once every
+10,000 cycles or something. This has the potential to do better than our
+static estimates about macro-task run times. It could potentially react to
+CPU cores that aren't performing equally, due to NUMA or thermal throttling
+or nonuniform competing memory traffic or whatever.
+
+=item C<Clock Domain Balancing>
+
+Right now Verilator makes no attempt to balance clock domains across
+macro-tasks. For a multi-domain model, that could lead to bad gantt chart
+fragmentation. This could be improved if it's a real problem in practice.
+
+=item C<Other Forms of MTask Balancing>
+
+The largest source of runtime overhead is idle CPUs, which happens due to
+variance between our predicted runtime for each MTask and its actual
+runtime. That variance is magnified if MTasks are homogeneous, containing
+similar repeating logic which was generally close together in source code
+and which is still packed together even after going through Verilator's
+digestive tract.
+
+If Verilator could avoid doing that, and instead would take source logic
+that was close together and distribute it across MTasks, that would
+increase the diversity of any given MTask, and this should reduce variance
+in the cost estimates.
+
+One way to do that might be to make various "tie breaker" comparison
+routines in the sources rely more heavily on randomness, and generally
+try harder not to keep input nodes together when we have the option to
+scramble things.
+
+=item C<Performance Regression>
+
+It would be nice if we had a regression of large designs, with some
+diversity of design styles, to test on both single- and multi-threaded
+modes. This would help to avoid performance regressions, and also to
+evaluate the optimizations while minimizing the impact of parasitic noise.
+
+=item C<Per-Instance Classes>
+
+If we have multiple instances of the same module, and they partition
+differently (likely; we make no attempt to partition them the same) then
+the variable sort will be suboptimal for either instance.  A possible
+improvement would be to emit a unique class for each instance of a module,
+and sort its variables optimally for that instance's code stream.
+
+=back
+
 =head2 Verilated Flow
 
 The evaluation loop outputted by Verilator is designed to allow a single
diff --git a/nodist/install_test b/nodist/install_test
index 77bf18cef..e8afbb1ea 100755
--- a/nodist/install_test
+++ b/nodist/install_test
@@ -64,6 +64,7 @@ sub test {
 	run("test -e $prefix/bin/verilator");
 	run("test -e $prefix/bin/verilator_bin");
 	run("test -e $prefix/bin/verilator_bin_dbg");
+        run("test -e $prefix/bin/verilator_gantt");
 	run("test -e $prefix/bin/verilator_profcfunc");
     }
 
diff --git a/src/Makefile_obj.in b/src/Makefile_obj.in
index 3780643e0..950bbbc4e 100644
--- a/src/Makefile_obj.in
+++ b/src/Makefile_obj.in
@@ -217,6 +217,7 @@ RAW_OBJS = \
 	V3Order.o \
 	V3Os.o \
 	V3Param.o \
+	V3Partition.o \
 	V3PreShell.o \
 	V3Premit.o \
 	V3Reloop.o \
diff --git a/src/V3Ast.h b/src/V3Ast.h
index bb2b42198..c0325672e 100644
--- a/src/V3Ast.h
+++ b/src/V3Ast.h
@@ -29,16 +29,24 @@
 #include <vector>
 #include <cmath>
 #include <map>
+#include VL_INCLUDE_UNORDERED_SET
 
 #include "V3Ast__gen_classes.h"	// From ./astgen
 // Things like:
 //   class V3AstNode;
 
+// Forward declarations
+class V3Graph;
+class ExecMTask;
+
 // Hint class so we can choose constructors
 class VFlagLogicPacked {};
 class VFlagBitPacked {};
 class VFlagChildDType {};  // Used by parser.y to select constructor that sets childDType
 
+// Used as key for another map, needs operator<, hence not an unordered_set
+typedef std::set<int> MTaskIdSet;  // Set of mtaskIds for Var sorting
+
 //######################################################################
 
 // For broken() function, return error string if have a match
diff --git a/src/V3AstNodes.cpp b/src/V3AstNodes.cpp
index 077fa1f38..7e0a178b8 100644
--- a/src/V3AstNodes.cpp
+++ b/src/V3AstNodes.cpp
@@ -31,6 +31,8 @@
 #include "V3Ast.h"
 #include "V3File.h"
 #include "V3Global.h"
+#include "V3Graph.h"
+#include "V3PartitionGraph.h"  // Just for mtask dumping
 
 //======================================================================
 // Special methods
@@ -151,22 +153,26 @@ AstNodeBiop* AstEqWild::newTyped(FileLine* fl, AstNode* lhsp, AstNode* rhsp) {
     }
 }
 
+AstExecGraph::AstExecGraph(FileLine* fileline)
+    : AstNode(fileline) {
+    m_depGraphp = new V3Graph;
+}
+AstExecGraph::~AstExecGraph() {
+    delete m_depGraphp; VL_DANGLING(m_depGraphp);
+}
+
 bool AstVar::isSigPublic() const {
     return (m_sigPublic || (v3Global.opt.allPublic() && !isTemp() && !isGenVar()));
 }
-
 bool AstVar::isScQuad() const {
     return (isSc() && isQuad() && !isScBv() && !isScBigUint());
 }
-
 bool AstVar::isScBv() const {
     return ((isSc() && width() >= v3Global.opt.pinsBv()) || m_attrScBv);
 }
-
 bool AstVar::isScUint() const {
     return ((isSc() && v3Global.opt.pinsScUint() && width() >= 2 && width() <= 64) && !isScBv());
 }
-
 bool AstVar::isScBigUint() const {
     return ((isSc() && v3Global.opt.pinsScBigUint() && width() >= 65 && width() <= 512) && !isScBv());
 }
@@ -441,6 +447,16 @@ AstVar* AstVar::scVarRecurse(AstNode* nodep) {
     return NULL;
 }
 
+string AstVar::mtasksString() const {
+    std::ostringstream os;
+    os<<" all: ";
+    for (MTaskIdSet::const_iterator it = m_mtaskIds.begin();
+         it != m_mtaskIds.end(); ++it) {
+        os<<*it<<" ";
+    }
+    return os.str();
+}
+
 AstNodeDType* AstNodeDType::dtypeDimensionp(int dimension) {
     // dimension passed from AstArraySel::dimension
     // Dimension 0 means the VAR itself, 1 is the closest SEL to the AstVar,
@@ -970,6 +986,11 @@ void AstSliceSel::dump(std::ostream& str) {
         str<<" decl"<<declRange();
     }
 }
+void AstMTaskBody::dump(std::ostream& str) {
+    this->AstNode::dump(str);
+    str<<" ";
+    if (m_execMTaskp) m_execMTaskp->dump(str);  // NULL until execMTaskp() is set
+}
 void AstTypeTable::dump(std::ostream& str) {
     this->AstNode::dump(str);
     for (int i=0; i<(int)(AstBasicDTypeKwd::_ENUM_MAX); ++i) {
diff --git a/src/V3AstNodes.h b/src/V3AstNodes.h
index fb01664a0..f716ea848 100644
--- a/src/V3AstNodes.h
+++ b/src/V3AstNodes.h
@@ -1124,6 +1124,7 @@ private:
     bool	m_noSubst:1;	// Do not substitute out references
     bool	m_trace:1;	// Trace this variable
     AstVarAttrClocker m_attrClocker;
+    MTaskIdSet  m_mtaskIds;  // MTaskID's that read or write this var
 
     void	init() {
 	m_input=false; m_output=false; m_tristate=false; m_declOutput=false;
@@ -1323,6 +1324,10 @@ public:
 	if (varType()==AstVarType::INPUT || varType()==AstVarType::OUTPUT) m_varType = AstVarType::WIRE;
     }
     static AstVar* scVarRecurse(AstNode* nodep);
+    void addProducingMTaskId(int id) { m_mtaskIds.insert(id); }
+    void addConsumingMTaskId(int id) { m_mtaskIds.insert(id); }
+    const MTaskIdSet& mtaskIds() const { return m_mtaskIds; }
+    string mtasksString() const;
 };
 
 class AstDefParam : public AstNode {
@@ -5698,6 +5703,44 @@ public:
     AstNode* bodysp() const { return op1p(); }  // op1= expressions to print
 };
 
+class AstMTaskBody : public AstNode {
+    // Hold statements for each MTask
+private:
+    ExecMTask* m_execMTaskp;
+public:
+    explicit AstMTaskBody(FileLine* flp)
+        : AstNode(flp)
+        , m_execMTaskp(NULL) {}
+    ASTNODE_NODE_FUNCS(MTaskBody);
+    virtual const char* broken() const { BROKEN_RTN(!m_execMTaskp); return NULL; }
+    AstNode* stmtsp() const { return op1p(); }
+    void addStmtsp(AstNode* nodep) { addOp1p(nodep); }
+    ExecMTask* execMTaskp() const { return m_execMTaskp; }
+    void execMTaskp(ExecMTask* execMTaskp) { m_execMTaskp = execMTaskp; }
+    virtual void dump(std::ostream& str=std::cout);
+};
+
+class AstExecGraph : public AstNode {
+    // For parallel execution, this node contains a dependency graph.  Each
+    // node in the graph is an ExecMTask, which contains a body for the
+    // mtask, which contains a set of AstActive's, each of which calls a
+    // leaf AstCFunc. whew!
+    //
+    // The mtask bodies are also children of this node, so we can visit
+    // them without traversing the graph (it's not always needed to
+    // traverse the graph.)
+private:
+    V3Graph *m_depGraphp;  // contains ExecMTask's
+public:
+    explicit AstExecGraph(FileLine* fileline);
+    ASTNODE_NODE_FUNCS_NO_DTOR(ExecGraph)
+    virtual ~AstExecGraph();
+    virtual const char* broken() const { BROKEN_RTN(!m_depGraphp); return NULL; }
+    const V3Graph* depGraphp() const { return m_depGraphp; }
+    V3Graph* mutableDepGraphp() { return m_depGraphp; }
+    void addMTaskBody(AstMTaskBody* bodyp) { addOp1p(bodyp); }
+};
+
 class AstSplitPlaceholder : public AstNode {
 public:
     // Dummy node used within V3Split; never exists outside of V3Split.
@@ -5749,12 +5792,14 @@ private:
     AstTypeTable* m_typeTablep;	// Reference to top type table, for faster lookup
     AstPackage*	  m_dollarUnitPkgp;
     AstCFunc*     m_evalp;      // The '_eval' function
+    AstExecGraph* m_execGraphp;  // Execution MTask graph for threads>1 mode
 public:
     AstNetlist()
 	: AstNode(new FileLine("AstRoot",0))
 	, m_typeTablep(NULL)
 	, m_dollarUnitPkgp(NULL)
-	, m_evalp(NULL) { }
+        , m_evalp(NULL)
+        , m_execGraphp(NULL) { }
     ASTNODE_NODE_FUNCS(Netlist)
     virtual const char* broken() const {
         BROKEN_RTN(m_dollarUnitPkgp && !m_dollarUnitPkgp->brokeExists());
@@ -5784,6 +5829,8 @@ public:
 	return m_dollarUnitPkgp; }
     AstCFunc* evalp() const { return m_evalp; }
     void evalp(AstCFunc* evalp) { m_evalp = evalp; }
+    AstExecGraph* execGraphp() const { return m_execGraphp; }
+    void execGraphp(AstExecGraph* graphp) { m_execGraphp = graphp; }
 };
 
 //######################################################################
diff --git a/src/V3Clock.cpp b/src/V3Clock.cpp
index b38fa9e14..edbaa29a0 100644
--- a/src/V3Clock.cpp
+++ b/src/V3Clock.cpp
@@ -68,6 +68,7 @@ private:
     AstCFunc*		m_settleFuncp;	// Top settlement function we are creating
     AstSenTree*		m_lastSenp;	// Last sensitivity match, so we can detect duplicates.
     AstIf*		m_lastIfp;	// Last sensitivity if active to add more under
+    AstMTaskBody*       m_mtaskBodyp;   // Current mtask body
 
     // METHODS
     VL_DEBUG_FUNC;  // Declare debug()
@@ -338,6 +339,30 @@ private:
 	    // Only empty blocks should be leftover on the non-top.  Killem.
 	    if (nodep->stmtsp()) nodep->v3fatalSrc("Non-empty lower active");
 	    nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
+        } else if (m_mtaskBodyp) {
+            UINFO(4,"  TR ACTIVE  "<<nodep<<endl);
+            AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
+            if (nodep->hasClocked()) {
+                if (nodep->hasInitial()) nodep->v3fatalSrc("Initial block should not have clock sensitivity");
+                if (m_lastSenp && nodep->sensesp()->sameTree(m_lastSenp)) {
+                    UINFO(4,"    sameSenseTree\n");
+                } else {
+                    clearLastSen();
+                    m_lastSenp = nodep->sensesp();
+                    // Make a new if statement
+                    m_lastIfp = makeActiveIf(m_lastSenp);
+                    m_mtaskBodyp->addStmtsp(m_lastIfp);
+                }
+                // Move statements to if
+                m_lastIfp->addIfsp(stmtsp);
+            } else if (nodep->hasInitial() || nodep->hasSettle()) {
+                nodep->v3fatalSrc("MTask should not include initial/settle logic.");
+            } else {
+                // Combo logic. Move statements to mtask func.
+                clearLastSen();
+                m_mtaskBodyp->addStmtsp(stmtsp);
+            }
+            nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
 	} else {
 	    UINFO(4,"  ACTIVE  "<<nodep<<endl);
 	    AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
@@ -372,6 +397,20 @@ private:
 	    nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
 	}
     }
+    virtual void visit(AstExecGraph* nodep) {
+        for (m_mtaskBodyp = VN_CAST(nodep->op1p(), MTaskBody);
+             m_mtaskBodyp;
+             m_mtaskBodyp = VN_CAST(m_mtaskBodyp->nextp(), MTaskBody)) {
+            clearLastSen();
+            iterate(m_mtaskBodyp);
+        }
+        clearLastSen();
+        // Move the ExecGraph into _eval. Its location marks the
+        // spot where the graph will execute, relative to other
+        // (serial) logic in the cycle.
+        nodep->unlinkFrBack();
+        addToEvalLoop(nodep);
+    }
 
     //--------------------
     // Default: Just iterate
@@ -391,6 +430,7 @@ public:
         m_lastSenp = NULL;
 	m_lastIfp = NULL;
 	m_scopep = NULL;
+        m_mtaskBodyp = NULL;
 	//
         iterate(nodep);
         // Allow downstream modules to find _eval()
diff --git a/src/V3EmitC.cpp b/src/V3EmitC.cpp
index d41fc55a8..9e3a69457 100644
--- a/src/V3EmitC.cpp
+++ b/src/V3EmitC.cpp
@@ -34,6 +34,8 @@
 #include "V3EmitC.h"
 #include "V3EmitCBase.h"
 #include "V3Number.h"
+#include "V3PartitionGraph.h"
+#include "V3TSP.h"
 
 #define VL_VALUE_STRING_MAX_WIDTH 8192	// We use a static char array in VL_VALUE_STRING
 
@@ -103,7 +105,13 @@ public:
 	    puts("["+cvtToStr(arrayp->elementsConst())+"]");
 	}
     }
-
+    void emitVarCmtChg(const AstVar* varp, string* curVarCmtp) {
+        string newVarCmt = varp->mtasksString();
+        if (*curVarCmtp != newVarCmt) {
+            *curVarCmtp = newVarCmt;
+            puts("// Begin mtask footprint "+*curVarCmtp+"\n");
+        }
+    }
     void emitTypedefs(AstNode* firstp) {
 	bool first = true;
 	for (AstNode* loopp=firstp; loopp; loopp = loopp->nextp()) {
@@ -783,6 +791,50 @@ public:
     virtual ~EmitCStmts() {}
 };
 
+//######################################################################
+// Establish mtask variable sort order in mtasks mode
+
+class EmitVarTspSorter : public V3TSP::TspStateBase {  // TSP state for one MTaskIdSet
+private:
+    // MEMBERS
+    const MTaskIdSet& m_mtaskIds;  // Set of mtask IDs this state stands for; held by reference, not copied
+    static unsigned m_serialNext;  // Last serial handed out; pre-incremented by each constructor
+    unsigned m_serial;  // Creation order of this state, compared by operator<
+public:
+    // CONSTRUCTORS
+    explicit EmitVarTspSorter(const MTaskIdSet& mtaskIds)
+        : m_mtaskIds(mtaskIds),
+          m_serial(++m_serialNext) {}
+    virtual ~EmitVarTspSorter() {}
+    // METHODS
+    bool operator<(const TspStateBase& other) const {
+        return operator<(dynamic_cast<const EmitVarTspSorter&>(other));
+    }
+    bool operator<(const EmitVarTspSorter& other) const {
+        return m_serial < other.m_serial;
+    }
+    const MTaskIdSet& mtaskIds() const { return m_mtaskIds; }
+    virtual int cost(const TspStateBase* otherp) const {
+        return cost(dynamic_cast<const EmitVarTspSorter*>(otherp));  // NOTE(review): cast result is not NULL-checked
+    }
+    virtual int cost(const EmitVarTspSorter* otherp) const {  // IDs in exactly one of the two sets
+        int cost = diffs(m_mtaskIds, otherp->m_mtaskIds);
+        cost += diffs(otherp->m_mtaskIds, m_mtaskIds);
+        return cost;
+    }
+    // Returns the number of elements in set_a that don't appear in set_b
+    static int diffs(const MTaskIdSet& set_a, const MTaskIdSet& set_b) {
+        int diffs = 0;
+        for (MTaskIdSet::iterator it = set_a.begin();
+             it != set_a.end(); ++it) {
+            if (set_b.find(*it) == set_b.end()) ++diffs;
+        }
+        return diffs;
+    }
+};
+
+unsigned EmitVarTspSorter::m_serialNext = 0;
+
 //######################################################################
 // Internal EmitC implementation
 
@@ -873,6 +925,91 @@ class EmitCImp : EmitCStmts {
 	return ofp;
     }
 
+    // Returns the number of cross-thread dependencies into mtaskp.
+    // If >0, mtaskp must test whether its prereqs are done before starting,
+    // and may need to block.
+    static uint32_t packedMTaskMayBlock(const ExecMTask* mtaskp) {
+        uint32_t result = 0;
+        for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep; edgep = edgep->inNextp()) {
+            const ExecMTask* prevp = dynamic_cast<ExecMTask*>(edgep->fromp());
+            if (prevp->thread() != mtaskp->thread()) {
+                ++result;
+            }
+        }
+        return result;
+    }
+
+    void emitMTaskBody(AstMTaskBody* nodep) {
+        ExecMTask* curExecMTaskp = nodep->execMTaskp();
+        if (packedMTaskMayBlock(curExecMTaskp)) {
+            puts("vlTOPp->__Vm_mt_" + cvtToStr(curExecMTaskp->id())
+                 + ".waitUntilUpstreamDone(even_cycle);\n");
+        }
+
+        string recName;
+        if (v3Global.opt.profThreads()) {
+            recName = "__Vprfthr_" + cvtToStr(curExecMTaskp->id());
+            puts("VlProfileRec* " + recName + " = NULL;\n");
+            // Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
+            puts("if (VL_UNLIKELY(vlTOPp->__Vm_profile_cycle_start)) {\n");
+            puts(  recName + " = vlTOPp->__Vm_threadPoolp->profileAppend();\n");
+            puts(  recName + "->startRecord(VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start,");
+            puts(               " "+cvtToStr(curExecMTaskp->id())+ ",");
+            puts(               " "+cvtToStr(curExecMTaskp->cost())+");\n");
+            puts("}\n");
+        }
+        puts("Verilated::mtaskId(" + cvtToStr(curExecMTaskp->id()) + ");\n");
+
+        // The actual body of calls to leaf functions
+        iterateAndNextNull(nodep->stmtsp());
+
+        if (v3Global.opt.profThreads()) {
+            // Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
+            puts("if (VL_UNLIKELY("+recName+")) {\n");
+            puts(  recName + "->endRecord(VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start);\n");
+            puts("}\n");
+        }
+
+        // Flush message queue
+        puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
+
+        // For any downstream mtask that's on another thread, bump its
+        // counter and maybe notify it.
+        for (V3GraphEdge* edgep = curExecMTaskp->outBeginp();
+             edgep; edgep = edgep->outNextp()) {
+            const ExecMTask* nextp = dynamic_cast<ExecMTask*>(edgep->top());
+            if (nextp->thread() != curExecMTaskp->thread()) {
+                puts("vlTOPp->__Vm_mt_"+cvtToStr(nextp->id())
+                     + ".signalUpstreamDone(even_cycle);\n");
+            }
+        }
+
+        // Run the next mtask inline
+        const ExecMTask* nextp = curExecMTaskp->packNextp();
+        if (nextp) {
+            emitMTaskBody(nextp->bodyp());
+        } else {
+            // Unblock the fake "final" mtask
+            puts("vlTOPp->__Vm_mt_final.signalUpstreamDone(even_cycle);\n");
+        }
+    }
+
+    virtual void visit(AstMTaskBody* nodep) {
+        ExecMTask* mtp = nodep->execMTaskp();
+        puts("\n");
+        puts("void ");
+        puts(modClassName(m_modp)+"::"+mtp->cFuncName());
+        puts("(bool even_cycle, void* symtab) {\n");
+
+        // Declare and set vlSymsp
+        puts(EmitCBaseVisitor::symClassVar() + " = ("
+             + EmitCBaseVisitor::symClassName() + "*)symtab;\n");
+        puts(EmitCBaseVisitor::symTopAssign()+"\n");
+
+        emitMTaskBody(nodep);
+        puts("}\n");
+    }
+
     //---------------------------------------
     // VISITORS
     using EmitCStmts::visit;  // Suppress hidden overloaded virtual function warning
@@ -973,6 +1110,54 @@ class EmitCImp : EmitCStmts {
 	emitVarReset(varp);
     }
 
+    virtual void visit(AstExecGraph* nodep) {
+        if (nodep != v3Global.rootp()->execGraphp()) {
+            nodep->v3fatalSrc("ExecGraph should be a singleton!");
+        }
+        // The location of the AstExecGraph within the containing _eval()
+        // function is where we want to invoke the graph and wait for it to
+        // complete. Do that now.
+        //
+        // Don't recurse to children -- this isn't the place to emit
+        // function definitions for the nested CFuncs. We'll do that at the
+        // end.
+        puts("vlTOPp->__Vm_even_cycle = !vlTOPp->__Vm_even_cycle;\n");
+
+        // Build the list of initial mtasks to start
+        std::vector<const ExecMTask*> execMTasks;
+
+        // Collect each thread-root mtask; they are launched in the loop below
+        for (const V3GraphVertex* vxp = nodep->depGraphp()->verticesBeginp();
+             vxp; vxp = vxp->verticesNextp()) {
+            const ExecMTask* etp = dynamic_cast<const ExecMTask*>(vxp);
+            if (etp->threadRoot()) execMTasks.push_back(etp);
+        }
+        if (execMTasks.size() >
+            static_cast<unsigned>(v3Global.opt.threads())) {
+            nodep->v3fatalSrc("More root mtasks than available threads");
+        }
+
+        if (!execMTasks.empty()) {
+            for (uint32_t i = 0; i < execMTasks.size(); ++i) {
+                bool runInline = (i == execMTasks.size() - 1);
+                if (runInline) {
+                    // The thread calling eval() will run this mtask inline,
+                    // along with its packed successors.
+                    puts(execMTasks[i]->cFuncName()
+                         + "(vlTOPp->__Vm_even_cycle, vlSymsp);\n");
+                    puts("Verilated::mtaskId(0);\n");
+                } else {
+                    // The other N-1 go to the thread pool.
+                    puts("vlTOPp->__Vm_threadPoolp->workerp("
+                         + cvtToStr(i)+")->addTask("
+                         + execMTasks[i]->cFuncName()
+                         + ", vlTOPp->__Vm_even_cycle, vlSymsp);\n");
+                }
+            }
+            puts("vlTOPp->__Vm_mt_final.waitUntilUpstreamDone(vlTOPp->__Vm_even_cycle);\n");
+        }
+    }
+
     //---------------------------------------
     // ACCESSORS
 
@@ -995,6 +1180,8 @@ class EmitCImp : EmitCStmts {
     void emitStaticDecl(AstNodeModule* modp);
     void emitSettleLoop(const std::string& eval_call, bool initial);
     void emitWrapEval(AstNodeModule* modp);
+    void emitMTaskState();
+    void emitMTaskVertexCtors(bool* firstp);
     void emitInt(AstNodeModule* modp);
     void maybeSplit(AstNodeModule* modp);
 
@@ -1534,6 +1721,36 @@ void EmitCImp::emitCoverageDecl(AstNodeModule* modp) {
     }
 }
 
+void EmitCImp::emitMTaskVertexCtors(bool* firstp) {
+    // Emit the constructor initializers for the mtask-related members.
+    AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
+    if (!execGraphp) v3Global.rootp()->v3fatalSrc("Should have an execGraphp");
+    const V3Graph* depGraphp = execGraphp->depGraphp();
+    unsigned finalEdgesInCt = 0;
+    for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
+         vxp; vxp = vxp->verticesNextp()) {
+        const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
+        // Cross-thread upstream count; an initializer is emitted only if nonzero
+        unsigned edgesInCt = packedMTaskMayBlock(mtp);
+        if (edgesInCt > 0) {
+            emitCtorSep(firstp);
+            puts("__Vm_mt_"+cvtToStr(mtp->id())+"("+cvtToStr(edgesInCt)+")");
+        }
+        // Each mtask with no packed successor will become a dependency
+        // for the final node:
+        if (!mtp->packNextp()) ++finalEdgesInCt;
+    }
+
+    emitCtorSep(firstp); puts("__Vm_mt_final(" + cvtToStr(finalEdgesInCt) + ")");
+
+    emitCtorSep(firstp); puts("__Vm_threadPoolp(NULL)");
+    if (v3Global.opt.profThreads()) {
+        emitCtorSep(firstp); puts("__Vm_profile_cycle_start(0)");
+    }
+    // This will flip to 'true' before the start of the 0th cycle.
+    emitCtorSep(firstp); puts("__Vm_even_cycle(false)");
+}
+
 void EmitCImp::emitCtorImp(AstNodeModule* modp) {
     puts("\n");
     bool first = true;
@@ -1544,6 +1761,9 @@ void EmitCImp::emitCtorImp(AstNodeModule* modp) {
         first = false;  // VL_CTOR_IMP includes the first ':'
     }
     emitVarCtors(&first);
+    if (modp->isTop() && v3Global.opt.mtasks()) {
+        emitMTaskVertexCtors(&first);
+    }
     puts(" {\n");
     emitCellCtors(modp);
     emitSensitives();
@@ -1556,6 +1776,39 @@ void EmitCImp::emitCtorImp(AstNodeModule* modp) {
     putsDecoration("// Reset structure values\n");
     puts("_ctor_var_reset();\n");
     emitTextSection(AstType::atScCtor);
+
+    if (modp->isTop() && v3Global.opt.mtasks()) {
+        // TODO-- For now each top module creates its own ThreadPool here,
+        // and deletes it in the destructor. If A and B are each top level
+        // modules, each creates a separate thread pool.  This allows
+        // A.eval() and B.eval() to run concurrently without any
+        // interference -- so long as the physical machine has enough cores
+        // to support both pools and all testbench threads.
+        //
+        // In the future, we might want to let the client provide a
+        // threadpool to the constructor. This would allow two or more
+        // models to share a single threadpool.
+        //
+        // For example: suppose models A and B are each compiled to run on
+        // 4 threads. The client might create a single thread pool with 3
+        // threads and pass it to both models. If the client can ensure that
+        // A.eval() and B.eval() do NOT run concurrently, there will be no
+        // contention for the threads. This mode is missing for now.  (Is
+        // there demand for such a setup?)
+        puts("__Vm_threadPoolp = new VlThreadPool("
+             // Note we create N-1 threads in the thread pool. The thread
+             // that calls eval() becomes the final Nth thread for the
+             // duration of the eval call.
+             + cvtToStr(v3Global.opt.threads() - 1)
+             + ", " + cvtToStr(v3Global.opt.profThreads())
+             + ");\n");
+
+        if (v3Global.opt.profThreads()) {
+            puts("__Vm_profile_cycle_start = 0;\n");
+            puts("__Vm_profile_time_finished = 0;\n");
+            puts("__Vm_profile_window_ct = 0;\n");  // Newline keeps the emitted '}' on its own line
+        }
+    }
     puts("}\n");
 }
 
@@ -1597,6 +1850,9 @@ void EmitCImp::emitCoverageImp(AstNodeModule* modp) {
 void EmitCImp::emitDestructorImp(AstNodeModule* modp) {
     puts("\n");
     puts(modClassName(modp)+"::~"+modClassName(modp)+"() {\n");
+    if (modp->isTop() && v3Global.opt.mtasks()) {
+        puts("delete __Vm_threadPoolp; __Vm_threadPoolp = NULL;\n");
+    }
     emitTextSection(AstType::atScDtor);
     if (modp->isTop()) puts("delete __VlSymsp; __VlSymsp=NULL;\n");
     puts("}\n");
@@ -1796,9 +2052,47 @@ void EmitCImp::emitWrapEval(AstNodeModule* modp) {
     if (v3Global.opt.threads() == 1) {
 	uint32_t mtaskId = 0;
 	putsDecoration("// MTask "+cvtToStr(mtaskId)+" start\n");
-	puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask starting, mtaskId="+cvtToStr(mtaskId)+"\\n\"););\n");
+        puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask"+cvtToStr(mtaskId)+" starting\\n\"););\n");
 	puts("Verilated::mtaskId("+cvtToStr(mtaskId)+");\n");
     }
+
+    if (v3Global.opt.mtasks()
+        && v3Global.opt.profThreads()) {
+        puts("if (VL_UNLIKELY((Verilated::profThreadsStart() != __Vm_profile_time_finished)\n");
+        puts(                 " && (VL_TIME_Q() > Verilated::profThreadsStart())\n");
+        puts(                 " && (Verilated::profThreadsWindow() >= 1))) {\n");
+        // Within a profile (either starting, middle, or end)
+        puts(    "if (vlTOPp->__Vm_profile_window_ct == 0) {\n");  // Opening file?
+        // Start profile on this cycle. We'll capture a window worth, then
+        // only analyze the next window worth. The idea is that the first window
+        // capture will hit some cache-cold stuff (eg printf) but it'll be warm
+        // by the time we hit the second window, we hope.
+        puts(        "vlTOPp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
+        // "* 2" as first half is warmup, second half is collection
+        puts(        "vlTOPp->__Vm_profile_window_ct = Verilated::profThreadsWindow() * 2 + 1;\n");
+        puts(    "}\n");
+        puts(    "--vlTOPp->__Vm_profile_window_ct;\n");
+        puts(    "if (vlTOPp->__Vm_profile_window_ct == (Verilated::profThreadsWindow())) {\n");
+        // This barrier record in every threads' profile demarcates the
+        // cache-warm-up cycles before the barrier from the actual profile
+        // cycles afterward.
+        puts(        "vlTOPp->__Vm_threadPoolp->profileAppendAll(");
+        puts(                       "VlProfileRec(VlProfileRec::Barrier()));\n");
+        puts(        "vlTOPp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
+        puts(    "}\n");
+        puts(    "else if (vlTOPp->__Vm_profile_window_ct == 0) {\n");
+        // Ending file.
+        puts(        "vluint64_t elapsed = VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start;\n");
+        puts(        "vlTOPp->__Vm_threadPoolp->profileDump(Verilated::profThreadsFilenamep(), elapsed);\n");
+        // This turns off the test to enter the profiling code, but still
+        // allows the user to collect another profile by changing
+        // profThreadsStart
+        puts(        "__Vm_profile_time_finished = Verilated::profThreadsStart();\n");
+        puts(        "vlTOPp->__Vm_profile_cycle_start = 0;\n");
+        puts(    "}\n");
+        puts("}\n");
+    }
+
     emitSettleLoop(
         (string("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Clock loop\\n\"););\n")
          + (v3Global.opt.trace() ? "vlSymsp->__Vm_activity = true;\n" : "")
@@ -1832,10 +2126,13 @@ void EmitCStmts::emitVarList(AstNode* firstp, EisWhich which, const string& pref
     // Put out a list of signal declarations
     // in order of 0:clocks, 1:vluint8, 2:vluint16, 4:vluint32, 5:vluint64, 6:wide, 7:arrays
     // This aids cache packing and locality
-    // Largest->smallest reduces the number of pad variables.
-    // But for now, Smallest->largest makes it more likely a small offset will allow access to the signal.
-    // TODO: Move this sort to an earlier visitor stage.
     //
+    // Largest->smallest reduces the number of pad variables.  Also
+    // experimented with alternating between large->small and small->large
+    // on successive Mtask groups, but then when a new mtask gets added may
+    // cause a huge delta.
+    //
+    // TODO: Move this sort to an earlier visitor stage.
     VarSortMap varAnonMap;
     VarSortMap varNonanonMap;
 
@@ -1891,8 +2188,9 @@ void EmitCStmts::emitVarList(AstNode* firstp, EisWhich which, const string& pref
 
 void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
     UASSERT(sortedp->empty(), "Sorted should be initially empty");
-    {
-        // Plain old serial mode. Sort by size, from small to large.
+    if (!v3Global.opt.mtasks()) {
+        // Plain old serial mode. Sort by size, from small to large,
+        // to optimize for both packing and small offsets in code.
         for (VarSortMap::const_iterator it = vmap.begin();
              it != vmap.end(); ++it) {
             for (VarVec::const_iterator jt = it->second.begin();
@@ -1900,12 +2198,52 @@ void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
                 sortedp->push_back(*jt);
             }
         }
+        return;
+    }
+
+    // MacroTask mode.  Sort by MTask-affinity group first, size second.
+    typedef std::map<MTaskIdSet, VarSortMap> MTaskVarSortMap;
+    MTaskVarSortMap m2v;
+    for (VarSortMap::const_iterator it = vmap.begin(); it != vmap.end(); ++it) {
+        int size_class = it->first;
+        const VarVec& vec = it->second;
+        for (VarVec::const_iterator jt = vec.begin(); jt != vec.end(); ++jt) {
+            const AstVar* varp = *jt;
+            m2v[varp->mtaskIds()][size_class].push_back(varp);
+        }
+    }
+
+    // Create a TSP sort state for each MTaskIdSet footprint
+    V3TSP::StateVec states;
+    for (MTaskVarSortMap::iterator it = m2v.begin(); it != m2v.end(); ++it) {
+        states.push_back(new EmitVarTspSorter(it->first));
+    }
+
+    // Do the TSP sort
+    V3TSP::StateVec sorted_states;
+    V3TSP::tspSort(states, &sorted_states);
+
+    for (V3TSP::StateVec::iterator it = sorted_states.begin();
+         it != sorted_states.end(); ++it) {
+        const EmitVarTspSorter* statep = dynamic_cast<const EmitVarTspSorter*>(*it);
+        const VarSortMap& localVmap = m2v[statep->mtaskIds()];
+        // use rbegin/rend to sort size large->small
+        for (VarSortMap::const_reverse_iterator jt = localVmap.rbegin();
+             jt != localVmap.rend(); ++jt) {
+            const VarVec& vec = jt->second;
+            for (VarVec::const_iterator kt = vec.begin();
+                 kt != vec.end(); ++kt) {
+                sortedp->push_back(*kt);
+            }
+        }
+        delete statep; VL_DANGLING(statep);
     }
 }
 
 void EmitCStmts::emitSortedVarList(const VarVec& anons,
                                    const VarVec& nonanons,
                                    const string& prefixIfImp) {
+    string curVarCmt = "";
     // Output anons
     {
         int anonMembers = anons.size();
@@ -1933,6 +2271,7 @@ void EmitCStmts::emitSortedVarList(const VarVec& anons,
                     if (anonL1s != 1) puts("struct {\n");
                     for (int l0=0; l0<lim && it != anons.end(); ++l0) {
                         const AstVar* varp = *it;
+                        emitVarCmtChg(varp, &curVarCmt);
                         emitVarDecl(varp, prefixIfImp);
                         ++it;
                     }
@@ -1945,12 +2284,14 @@ void EmitCStmts::emitSortedVarList(const VarVec& anons,
         // Leftovers, just in case off by one error somewhere above
         for (; it != anons.end(); ++it) {
             const AstVar* varp = *it;
+            emitVarCmtChg(varp, &curVarCmt);
             emitVarDecl(varp, prefixIfImp);
         }
     }
     // Output nonanons
     for (VarVec::const_iterator it = nonanons.begin(); it != nonanons.end(); ++it) {
         const AstVar* varp = *it;
+        emitVarCmtChg(varp, &curVarCmt);
         emitVarDecl(varp, prefixIfImp);
     }
 }
@@ -1986,6 +2327,59 @@ void EmitCImp::emitIntFuncDecls(AstNodeModule* modp) {
 	    if (funcp->ifdef()!="") puts("#endif // "+funcp->ifdef()+"\n");
 	}
     }
+
+    if (modp->isTop() && v3Global.opt.mtasks()) {
+        // Emit the mtask func prototypes.
+        AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
+        if (!execGraphp) v3Global.rootp()->v3fatalSrc("Root should have an execGraphp");
+        const V3Graph* depGraphp = execGraphp->depGraphp();
+        for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
+             vxp; vxp = vxp->verticesNextp()) {
+            const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
+            if (mtp->threadRoot()) {
+                // Emit function declaration for this mtask
+                ofp()->putsPrivate(true);
+                puts("static void "); puts(mtp->cFuncName());
+                puts("(bool even_cycle, void* symtab);\n");
+            }
+        }
+        // No AstCFunc for this one, as it's synthetic. Just write it:
+        puts("static void __Vmtask__final(bool even_cycle, void* symtab);\n");
+    }
+}
+
+void EmitCImp::emitMTaskState() {
+    // Declare (privately) the members that track mtask execution state.
+    ofp()->putsPrivate(true);
+    AstExecGraph* const graphNodep = v3Global.rootp()->execGraphp();
+    if (!graphNodep) v3Global.rootp()->v3fatalSrc("Root should have an execGraphp");
+
+    // One VlMTaskVertex member per mtask for which packedMTaskMayBlock() > 0
+    const V3GraphVertex* vertexp = graphNodep->depGraphp()->verticesBeginp();
+    for (; vertexp; vertexp = vertexp->verticesNextp()) {
+        const ExecMTask* const mtaskp = dynamic_cast<const ExecMTask*>(vertexp);
+        if (!(packedMTaskMayBlock(mtaskp) > 0)) continue;
+        puts("VlMTaskVertex __Vm_mt_" + cvtToStr(mtaskp->id()) + ";\n");
+    }
+
+    // A synthetic "final" mtask that depends on every real mtask; it is
+    // used to block eval() until all of the mtasks are done.
+    // (A future "half wave" scheduler might let _eval() return before the
+    // graph has fully executed; for now we always wait for everything.)
+    // The thread pool pointer member is declared right after it.
+    puts("VlMTaskVertex __Vm_mt_final;\n");
+    puts("VlThreadPool* __Vm_threadPoolp;\n");
+
+    if (v3Global.opt.profThreads()) {
+        // rdtsc() at current cycle start
+        puts("vluint64_t __Vm_profile_cycle_start;\n");
+        // Time we finished analysis
+        puts("vluint64_t __Vm_profile_time_finished;\n");
+        // Position within the cache warmup and the actual profile window
+        puts("vluint32_t __Vm_profile_window_ct;\n");
+    }
+
+    puts("bool __Vm_even_cycle;\n");
 }
 
 void EmitCImp::emitInt(AstNodeModule* modp) {
@@ -2000,6 +2394,9 @@ void EmitCImp::emitInt(AstNodeModule* modp) {
     } else {
 	puts("#include \"verilated.h\"\n");
     }
+    if (v3Global.opt.mtasks()) {
+        puts("#include \"verilated_threads.h\"\n");
+    }
     if (v3Global.opt.savable()) {
 	puts("#include \"verilated_save.h\"\n");
     }
@@ -2084,6 +2481,9 @@ void EmitCImp::emitInt(AstNodeModule* modp) {
 	    puts("bool __Vm_inhibitSim;  ///< Set true to disable evaluation of module\n");
 	}
     }
+    if (modp->isTop() && v3Global.opt.mtasks()) {
+        emitMTaskState();
+    }
     emitCoverageDecl(modp);	// may flip public/private
 
     puts("\n// PARAMETERS\n");
@@ -2291,6 +2691,24 @@ void EmitCImp::main(AstNodeModule* modp, bool slow, bool fast) {
 	}
     }
 
+    if (fast && modp->isTop() && v3Global.opt.mtasks()) {
+        // Make a final pass and emit function definitions for the mtasks
+        // in the ExecGraph
+        AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
+        const V3Graph* depGraphp = execGraphp->depGraphp();
+        for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
+             vxp; vxp = vxp->verticesNextp()) {
+            const ExecMTask* mtaskp = dynamic_cast<const ExecMTask*>(vxp);
+            if (mtaskp->threadRoot()) {
+                maybeSplit(modp);
+                // Only define one function for all the mtasks packed on
+                // a given thread. We'll name this function after the
+                // root mtask though it contains multiple mtasks' worth
+                // of logic.
+                iterate(mtaskp->bodyp());
+            }
+        }
+    }
     delete m_ofp; m_ofp=NULL;
 }
 
diff --git a/src/V3EmitMk.cpp b/src/V3EmitMk.cpp
index a7f7908ff..a3ba685be 100644
--- a/src/V3EmitMk.cpp
+++ b/src/V3EmitMk.cpp
@@ -94,6 +94,9 @@ public:
 			    putMakeClassEntry(of, "verilated_vcd_sc.cpp");
 			}
 		    }
+                    if (v3Global.opt.mtasks()) {
+                        putMakeClassEntry(of, "verilated_threads.cpp");
+                    }
 		}
 		else if (support==2 && slow) {
 		}
diff --git a/src/V3Error.h b/src/V3Error.h
index d6b3720b1..ecb12dce5 100644
--- a/src/V3Error.h
+++ b/src/V3Error.h
@@ -131,7 +131,7 @@ public:
 	    "ALWCOMBORDER", "ASSIGNDLY", "ASSIGNIN",
 	    "BLKANDNBLK", "BLKLOOPINIT", "BLKSEQ", "BSSPACE",
 	    "CASEINCOMPLETE", "CASEOVERLAP", "CASEWITHX", "CASEX", "CDCRSTLOGIC", "CLKDATA",
-	    "CMPCONST", "COLONPLUS", "COMBDLY", "DEFPARAM", "DECLFILENAME",
+            "CMPCONST", "COLONPLUS", "COMBDLY", "DEFPARAM", "DECLFILENAME",
 	    "ENDLABEL", "GENCLK",
 	    "IFDEPTH", "IMPERFECTSCH", "IMPLICIT", "IMPURE",
             "INCABSPATH", "INFINITELOOP", "INITIALDLY",
diff --git a/src/V3LifePost.cpp b/src/V3LifePost.cpp
index 25ef5027c..e55fa64e0 100644
--- a/src/V3LifePost.cpp
+++ b/src/V3LifePost.cpp
@@ -37,6 +37,8 @@
 #include VL_INCLUDE_UNORDERED_MAP
 
 #include "V3Global.h"
+#include "V3PartitionGraph.h"
+#include "V3GraphPathChecker.h"
 #include "V3LifePost.h"
 #include "V3Stats.h"
 #include "V3Ast.h"
@@ -78,6 +80,11 @@ private:
             iterate(nodep->funcp());
         }
     }
+    virtual void visit(AstExecGraph* nodep) {
+        // Iterate over the MTask bodies under the ExecGraph in any order;
+        // order is irrelevant to LifePostElimVisitor's simple substitution.
+        iterateChildren(nodep);
+    }
     virtual void visit(AstCFunc* nodep) {
         if (!m_tracingCall && !nodep->entryPoint()) return;
         m_tracingCall = false;
@@ -101,11 +108,17 @@ public:
 // and a sequence number within the mtask:
 
 struct LifeLocation {
+    const ExecMTask* mtaskp;  // MTask this location belongs to; NULL for serial code
     uint32_t sequence;
 public:
-    LifeLocation() : sequence(0) {}
-    LifeLocation(uint32_t sequence_) : sequence(sequence_) {}
+    LifeLocation() : mtaskp(NULL), sequence(0) {}
+    LifeLocation(const ExecMTask* mtaskp_, uint32_t sequence_)
+        : mtaskp(mtaskp_), sequence(sequence_) {}
     bool operator< (const LifeLocation& b) const {
+        unsigned a_id = mtaskp ? mtaskp->id() : 0;  // A NULL mtask compares as id 0
+        unsigned b_id = b.mtaskp ? b.mtaskp->id() : 0;
+        if (a_id < b_id) { return true; }  // Primary key: mtask id
+        if (b_id < a_id) { return false; }  // Ids equal: fall through to the sequence
         return sequence < b.sequence;
     }
 };
@@ -130,6 +143,9 @@ private:
 
     // STATE
     uint32_t            m_sequence;     // Sequence number of assigns/varrefs,
+    //                                  // local to the current MTask.
+    const ExecMTask*    m_execMTaskp;   // Current ExecMTask being processed,
+    //                                  // or NULL for serial code.
     V3Double0           m_statAssnDel;  // Statistic tracking
     bool                m_tracingCall;  // Currently tracing a CCall to a CFunc
 
@@ -143,11 +159,15 @@ private:
     typedef vl_unordered_map<const AstVarScope*, LifePostLocation> PostLocMap;
     PostLocMap          m_assignposts;  // AssignPost dly var locations
 
+    const V3Graph*      m_mtasksGraphp;  // Mtask tracking graph
+    vl_unique_ptr<GraphPathChecker> m_checker;
+
     // METHODS
     VL_DEBUG_FUNC;  // Declare debug()
 
-    static bool before(const LifeLocation& a, const LifeLocation& b) {
-        return a.sequence < b.sequence;
+    bool before(const LifeLocation& a, const LifeLocation& b) {
+        if (a.mtaskp == b.mtaskp) return a.sequence < b.sequence;  // Same mtask, or both NULL
+        return m_checker->pathExistsFrom(a.mtaskp, b.mtaskp);  // NOTE(review): m_checker is only set in mtask mode
     }
     bool outsideCriticalArea(LifeLocation loc,
                              const std::set<LifeLocation>& dlyVarAssigns,
@@ -159,6 +179,13 @@ private:
         // Otherwise, loc could fall in the "critical" area where the
         // substitution affects the result of the operation at loc, so
         // return false.
+        if (!loc.mtaskp && assignPostLoc.mtaskp) {
+            // This is threaded mode; 'loc' is something that happens at
+            // initial/settle time, or perhaps in _eval() but outside of
+            // the mtask graph.
+            // In either case, it's not in the critical area.
+            return true;
+        }
         if (before(assignPostLoc, loc)) return true;
         for (std::set<LifeLocation>::iterator it = dlyVarAssigns.begin();
              it != dlyVarAssigns.end(); ++it) {
@@ -239,6 +266,17 @@ private:
         // within the mtask) where each varscope is read, and written.
         iterateChildren(nodep);
 
+        if (v3Global.opt.mtasks()) {
+            if (!m_mtasksGraphp) {
+                nodep->v3fatalSrc("Should have initted m_mtasksGraphp by now");
+            }
+            m_checker.reset(new GraphPathChecker(m_mtasksGraphp));
+        } else {
+            if (m_mtasksGraphp) {
+                nodep->v3fatalSrc("Did not expect any m_mtasksGraphp in serial mode");
+            }
+        }
+
         // Find all assignposts. Determine which ones can be
         // eliminated. Remove those, and mark their dly vars' user4 field
         // to indicate we should replace these dly vars with their original
@@ -252,7 +290,8 @@ private:
         // Consumption/generation of a variable,
         AstVarScope* vscp = nodep->varScopep();
         if (!vscp) nodep->v3fatalSrc("Scope not assigned");
-        LifeLocation loc(++m_sequence);
+
+        LifeLocation loc(m_execMTaskp, ++m_sequence);
         if (nodep->lvalue()) {
             m_writes[vscp].insert(loc);
         } else {
@@ -275,7 +314,7 @@ private:
             if (m_assignposts.find(dlyVarp) != m_assignposts.end()) {
                 nodep->v3fatalSrc("LifePostLocation attempted duplicate dlyvar map addition");
             }
-            LifeLocation loc(++m_sequence);
+            LifeLocation loc(m_execMTaskp, ++m_sequence);
             m_assignposts[dlyVarp] = LifePostLocation(loc, nodep);
         }
     }
@@ -291,6 +330,18 @@ private:
             iterate(nodep->funcp());
         }
     }
+    virtual void visit(AstExecGraph* nodep) {
+        // Walk every mtask body as though the ExecGraph called it
+        m_mtasksGraphp = nodep->depGraphp();
+        V3GraphVertex* vertexp = m_mtasksGraphp->verticesBeginp();
+        for (; vertexp; vertexp = vertexp->verticesNextp()) {
+            ExecMTask* const curMTaskp = dynamic_cast<ExecMTask*>(vertexp);
+            m_sequence = 0;  // Sequence numbers restart for each mtask
+            m_execMTaskp = curMTaskp;
+            iterate(curMTaskp->bodyp());
+        }
+        m_execMTaskp = NULL;  // Done with the graph; no current mtask
+    }
     virtual void visit(AstCFunc* nodep) {
         if (!m_tracingCall && !nodep->entryPoint()) return;
         m_tracingCall = false;
@@ -305,7 +356,9 @@ public:
     // CONSTRUCTORS
     explicit LifePostDlyVisitor(AstNetlist* nodep)
         : m_sequence(0)
-        , m_tracingCall(false) {
+        , m_execMTaskp(NULL)
+        , m_tracingCall(false)
+        , m_mtasksGraphp(NULL) {
         iterate(nodep);
     }
     virtual ~LifePostDlyVisitor() {
diff --git a/src/V3Options.cpp b/src/V3Options.cpp
index e82433884..9dc901a51 100644
--- a/src/V3Options.cpp
+++ b/src/V3Options.cpp
@@ -661,6 +661,9 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
 	    else if ( !strcmp (sw, "-debug-abort") )		{ abort(); } // Undocumented, see also --debug-sigsegv
 	    else if ( onoff   (sw, "-debug-check", flag/*ref*/) ){ m_debugCheck = flag; }
             else if ( onoff   (sw, "-debug-leak", flag/*ref*/) ){ m_debugLeak = flag; }
+            else if ( onoff   (sw, "-debug-nondeterminism", flag/*ref*/) ){ m_debugNondeterminism = flag; }
+            else if ( onoff   (sw, "-debug-partition", flag/*ref*/) ){ m_debugPartition = flag; }  // Undocumented
+            else if ( onoff   (sw, "-debug-self-test", flag/*ref*/) ){ m_debugSelfTest = flag; }  // Undocumented
 	    else if ( !strcmp (sw, "-debug-sigsegv") )		{ throwSigsegv(); }  // Undocumented, see also --debug-abort
 	    else if ( !strcmp (sw, "-debug-fatalsrc") )		{ v3fatalSrc("--debug-fatal-src"); }  // Undocumented, see also --debug-abort
 	    else if ( onoff   (sw, "-decoration", flag/*ref*/) ) { m_decoration = flag; }
@@ -678,6 +681,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
 	    else if ( !strcmp (sw, "-private") )		{ m_public = false; }
             else if ( onoff   (sw, "-prof-cfuncs", flag/*ref*/) )       { m_profCFuncs = flag; }
             else if ( onoff   (sw, "-profile-cfuncs", flag/*ref*/) )    { m_profCFuncs = flag; }  // Undocumented, for backward compat
+            else if ( onoff   (sw, "-prof-threads", flag/*ref*/) )      { m_profThreads = flag; }
 	    else if ( onoff   (sw, "-public", flag/*ref*/) )		{ m_public = flag; }
             else if ( !strncmp(sw, "-pvalue+", strlen("-pvalue+")))	{ addParameter(string(sw+strlen("-pvalue+")), false); }
             else if ( onoff   (sw, "-relative-cfuncs", flag/*ref*/) )   { m_relativeCFuncs = flag; }
@@ -689,6 +693,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
 	    else if ( onoff   (sw, "-stats", flag/*ref*/) )		{ m_stats = flag; }
 	    else if ( onoff   (sw, "-stats-vars", flag/*ref*/) )	{ m_statsVars = flag; m_stats |= flag; }
 	    else if ( !strcmp (sw, "-sv") )				{ m_defaultLanguage = V3LangCode::L1800_2005; }
+            else if ( onoff   (sw, "-threads-coarsen", flag/*ref*/))    { m_threadsCoarsen = flag; }  // Undocumented, debug
 	    else if ( onoff   (sw, "-trace", flag/*ref*/) )		{ m_trace = flag; }
 	    else if ( onoff   (sw, "-trace-dups", flag/*ref*/) )	{ m_traceDups = flag; }
 	    else if ( onoff   (sw, "-trace-params", flag/*ref*/) )	{ m_traceParams = flag; }
@@ -1013,6 +1018,20 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
 		shift; m_threads = atoi(argv[i]);
 		if (m_threads < 0) fl->v3fatal("--threads must be >= 0: "<<argv[i]);
 	    }
+            else if ( !strcmp (sw, "-threads-dpi") && (i+1)<argc) {
+                shift;
+                if (!strcmp(argv[i], "all")) { m_threadsDpiPure=true; m_threadsDpiUnpure=true; }
+                else if (!strcmp(argv[i], "none")) { m_threadsDpiPure=false; m_threadsDpiUnpure=false; }
+                else if (!strcmp(argv[i], "pure")) { m_threadsDpiPure=true; m_threadsDpiUnpure=false; }
+                else {
+                    fl->v3fatal("Unknown setting for --threads-dpi: "<<argv[i]);
+                }
+            }
+            else if ( !strcmp (sw, "-threads-max-mtasks") && (i+1)<argc ) {  // Requires a value argument
+                shift; m_threadsMaxMTasks = atoi(argv[i]);  // argv[i] is now the value after the switch
+                if (m_threadsMaxMTasks < 1)
+                    fl->v3fatal("--threads-max-mtasks must be >= 1: "<<argv[i]);
+            }
 	    else if ( !strcmp (sw, "-top-module") && (i+1)<argc ) {
 		shift; m_topModule = argv[i];
 	    }
@@ -1223,6 +1242,9 @@ V3Options::V3Options() {
     m_coverageUser = false;
     m_debugCheck = false;
     m_debugLeak = true;
+    m_debugNondeterminism = false;
+    m_debugPartition = false;
+    m_debugSelfTest = false;
     m_decoration = true;
     m_exe = false;
     m_ignc = false;
@@ -1237,6 +1259,7 @@ V3Options::V3Options() {
     m_pinsScBigUint = false;
     m_pinsUint8 = false;
     m_profCFuncs = false;
+    m_profThreads = false;
     m_preprocOnly = false;
     m_preprocNoLine = false;
     m_public = false;
@@ -1249,6 +1272,10 @@ V3Options::V3Options() {
     m_statsVars = false;
     m_systemC = false;
     m_threads = 0;
+    m_threadsDpiPure = true;
+    m_threadsDpiUnpure = false;
+    m_threadsCoarsen = true;
+    m_threadsMaxMTasks = 0;
     m_trace = false;
     m_traceDups = false;
     m_traceParams = true;
diff --git a/src/V3Options.h b/src/V3Options.h
index c00ecff3c..db05a6137 100644
--- a/src/V3Options.h
+++ b/src/V3Options.h
@@ -75,7 +75,10 @@ class V3Options {
     bool	m_coverageUnderscore;// main switch: --coverage-underscore
     bool	m_coverageUser;	// main switch: --coverage-func
     bool	m_debugCheck;	// main switch: --debug-check
-    bool        m_debugLeak;   // main switch: --debug-leak
+    bool        m_debugLeak;    // main switch: --debug-leak
+    bool        m_debugNondeterminism;  // main switch: --debug-nondeterminism
+    bool        m_debugPartition;  // main switch: --debug-partition
+    bool        m_debugSelfTest;  // main switch: --debug-self-test
     bool	m_decoration;	// main switch: --decoration
     bool	m_exe;		// main switch: --exe
     bool	m_ignc;		// main switch: --ignc
@@ -87,6 +90,7 @@ class V3Options {
     bool	m_pinsScBigUint;// main switch: --pins-sc-biguint
     bool	m_pinsUint8;	// main switch: --pins-uint8
     bool        m_profCFuncs;   // main switch: --prof-cfuncs
+    bool        m_profThreads;  // main switch: --prof-threads
     bool	m_public;	// main switch: --public
     bool	m_relativeCFuncs; // main switch: --relative-cfuncs
     bool	m_relativeIncludes; // main switch: --relative-includes
@@ -96,6 +100,9 @@ class V3Options {
     bool	m_skipIdentical;// main switch: --skip-identical
     bool	m_stats;	// main switch: --stats
     bool	m_statsVars;	// main switch: --stats-vars
+    bool        m_threadsCoarsen;  // main switch: --threads-coarsen
+    bool        m_threadsDpiPure;  // main switch: --threads-dpi all/pure
+    bool        m_threadsDpiUnpure;  // main switch: --threads-dpi all
     bool	m_trace;	// main switch: --trace
     bool	m_traceDups;	// main switch: --trace-dups
     bool	m_traceParams;	// main switch: --trace-params
@@ -117,6 +124,7 @@ class V3Options {
     int		m_outputSplitCTrace;// main switch: --output-split-ctrace
     int		m_pinsBv;	// main switch: --pins-bv
     int		m_threads;	// main switch: --threads (0 == --no-threads)
+    int         m_threadsMaxMTasks;  // main switch: --threads-max-mtasks
     int		m_traceDepth;	// main switch: --trace-depth
     int		m_traceMaxArray;// main switch: --trace-max-array
     int		m_traceMaxWidth;// main switch: --trace-max-width
@@ -232,8 +240,14 @@ class V3Options {
     bool coverageUser() const { return m_coverageUser; }
     bool debugCheck() const { return m_debugCheck; }
     bool debugLeak() const { return m_debugLeak; }
+    bool debugNondeterminism() const { return m_debugNondeterminism; }
+    bool debugPartition() const { return m_debugPartition; }
+    bool debugSelfTest() const { return m_debugSelfTest; }
     bool decoration() const { return m_decoration; }
     bool exe() const { return m_exe; }
+    bool threadsDpiPure() const { return m_threadsDpiPure; }
+    bool threadsDpiUnpure() const { return m_threadsDpiUnpure; }
+    bool threadsCoarsen() const { return m_threadsCoarsen; }
     bool trace() const { return m_trace; }
     bool traceDups() const { return m_traceDups; }
     bool traceParams() const { return m_traceParams; }
@@ -246,6 +260,7 @@ class V3Options {
     bool pinsScBigUint() const { return m_pinsScBigUint; }
     bool pinsUint8() const { return m_pinsUint8; }
     bool profCFuncs() const { return m_profCFuncs; }
+    bool profThreads() const { return m_profThreads; }
     bool allPublic() const { return m_public; }
     bool lintOnly() const { return m_lintOnly; }
     bool ignc() const { return m_ignc; }
@@ -267,6 +282,7 @@ class V3Options {
     int	   outputSplitCTrace() const { return m_outputSplitCTrace; }
     int	   pinsBv() const { return m_pinsBv; }
     int threads() const { return m_threads; }
+    int threadsMaxMTasks() const { return m_threadsMaxMTasks; }
     bool mtasks() const { return (m_threads > 1); }
     int	   traceDepth() const { return m_traceDepth; }
     int	   traceMaxArray() const { return m_traceMaxArray; }
diff --git a/src/V3Order.cpp b/src/V3Order.cpp
index a52c0bb55..c6b1fc5d2 100644
--- a/src/V3Order.cpp
+++ b/src/V3Order.cpp
@@ -89,19 +89,22 @@
 #include <sstream>
 #include <memory>
 
-#include "V3Global.h"
-#include "V3File.h"
 #include "V3Ast.h"
+#include "V3Const.h"
+#include "V3EmitCBase.h"
+#include "V3EmitV.h"
+#include "V3File.h"
+#include "V3Global.h"
 #include "V3Graph.h"
+#include "V3GraphStream.h"
 #include "V3List.h"
+#include "V3Partition.h"
+#include "V3PartitionGraph.h"
 #include "V3SenTree.h"
 #include "V3Stats.h"
-#include "V3EmitCBase.h"
-#include "V3Const.h"
 
 #include "V3Order.h"
 #include "V3OrderGraph.h"
-#include "V3EmitV.h"
 
 #include VL_INCLUDE_UNORDERED_MAP
 #include VL_INCLUDE_UNORDERED_SET
@@ -423,10 +426,15 @@ class ProcessMoveBuildGraph {
     // OrderVisitor. It produces a slightly coarsened graph to drive the
     // code scheduling.
     //
-    // * The new graph contains nodes of type OrderMoveVertex.
+    // * For the serial code scheduler, the new graph contains
+    //   nodes of type OrderMoveVertex.
+    //
+    // * For the threaded code scheduler, the new graph contains
+    //   nodes of type MTaskMoveVertex.
     //
     // * The difference in output type is abstracted away by the
-    //   'T_MoveVertex' template parameter.
+    //   'T_MoveVertex' template parameter; ProcessMoveBuildGraph otherwise
+    //   works the same way for both cases.
 
     // TYPES
     typedef std::pair<const V3GraphVertex*, const AstSenTree*> VxDomPair;
@@ -563,7 +571,7 @@ private:
 };
 
 //######################################################################
-// OrderMoveVertexMaker
+// OrderMoveVertexMaker and related
 
 class OrderMoveVertexMaker
     : public ProcessMoveBuildGraph<OrderMoveVertex>::MoveVertexMaker {
@@ -595,6 +603,64 @@ private:
     VL_UNCOPYABLE(OrderMoveVertexMaker);
 };
 
+class OrderMTaskMoveVertexMaker
+    : public ProcessMoveBuildGraph<MTaskMoveVertex>::MoveVertexMaker {
+    V3Graph* m_pomGraphp;  // Graph handed to each new MTaskMoveVertex
+public:
+    explicit OrderMTaskMoveVertexMaker(V3Graph* pomGraphp)
+        : m_pomGraphp(pomGraphp) {}
+    MTaskMoveVertex* makeVertexp(OrderLogicVertex* lvertexp,
+                                 const OrderEitherVertex* varVertexp,
+                                 const AstScope* scopep,
+                                 const AstSenTree* domainp) {
+        // Initial/settle logic stays out of the mtasks graph (time-zero
+        // logic is output separately), so no vertex is made for it.
+        const bool timeZero = domainp->hasInitial() || domainp->hasSettle();
+        if (timeZero) return NULL;
+        return new MTaskMoveVertex(m_pomGraphp, lvertexp, varVertexp, scopep, domainp);
+    }
+    // Dispose of a vertex by calling unlinkDelete() with the same graph
+    void freeVertexp(MTaskMoveVertex* freeMep) {
+        freeMep->unlinkDelete(m_pomGraphp);
+    }
+private:
+    VL_UNCOPYABLE(OrderMTaskMoveVertexMaker);
+};
+
+class OrderVerticesByDomainThenScope {
+    PartPtrIdMap m_ids;  // Id lookup providing the sort keys
+public:
+    virtual ~OrderVerticesByDomainThenScope() {}  // Class has a virtual method; match MTaskVxIdLessThan
+    virtual bool operator()(const V3GraphVertex* lhsp,  // Less-than: by domain id, then scope id
+                            const V3GraphVertex* rhsp) const {
+        const MTaskMoveVertex* l_vxp = dynamic_cast<const MTaskMoveVertex*>(lhsp);
+        const MTaskMoveVertex* r_vxp = dynamic_cast<const MTaskMoveVertex*>(rhsp);
+        vluint64_t l_id = m_ids.findId(l_vxp->domainp());
+        vluint64_t r_id = m_ids.findId(r_vxp->domainp());
+        if (l_id != r_id) return l_id < r_id;  // Domains differ: they decide the order
+        l_id = m_ids.findId(l_vxp->scopep());  // Same domain: fall back to the scope ids
+        r_id = m_ids.findId(r_vxp->scopep());
+        return l_id < r_id;
+    }
+};
+
+class MTaskVxIdLessThan {
+public:
+    MTaskVxIdLessThan() {}
+    virtual ~MTaskVxIdLessThan() {}
+
+    // Sort vertices, which must be AbstractLogicMTask's (that is what they
+    // are cast to), into a deterministic order by comparing their serial IDs.
+    virtual bool operator()(const V3GraphVertex* lhsp,
+                            const V3GraphVertex* rhsp) const {
+        const AbstractMTask* lmtaskp =
+            dynamic_cast<const AbstractLogicMTask*>(lhsp);
+        const AbstractMTask* rmtaskp =
+            dynamic_cast<const AbstractLogicMTask*>(rhsp);
+        return lmtaskp->id() < rmtaskp->id();
+    }
+};
+
 //######################################################################
 // Order class functions
 
@@ -701,6 +767,7 @@ private:
     void processDomainsIterate(OrderEitherVertex* vertexp);
     void processEdgeReport();
 
+    // processMove* routines schedule serial execution
     void processMove();
     void processMoveClear();
     void processMoveBuildGraph();
@@ -711,6 +778,18 @@ private:
     AstActive* processMoveOneLogic(const OrderLogicVertex* lvertexp,
                                    AstCFunc*& newFuncpr, int& newStmtsr);
 
+    // processMTask* routines schedule threaded execution
+    struct MTaskState {  // Per-mtask bookkeeping, keyed by mtask id in processMTasks()
+        typedef std::list<const OrderLogicVertex*> Logics;
+        AstMTaskBody* m_mtaskBodyp;  // Body node created for this mtask
+        Logics m_logics;  // Logic vertices, in the order they will execute
+        ExecMTask* m_execMTaskp;  // ExecMTask cross-linked with the body
+        MTaskState() : m_mtaskBodyp(NULL), m_execMTaskp(NULL) {}
+    };
+    void processMTasks();
+    typedef enum {LOGIC_INITIAL, LOGIC_SETTLE} InitialLogicE;
+    void processMTasksInitial(InitialLogicE logic_type);
+
     string cfuncName(AstNodeModule* modp, AstSenTree* domainp, AstScope* scopep, AstNode* forWhatp) {
 	modp->user3Inc();
 	int funcnum = modp->user3();
@@ -1726,6 +1805,173 @@ AstActive* OrderVisitor::processMoveOneLogic(const OrderLogicVertex* lvertexp,
     return activep;
 }
 
+void OrderVisitor::processMTasksInitial(InitialLogicE logic_type) {
+    // Emit the initial or the settle logic, as chosen by logic_type.
+    // Such logic is not part of the mtask partition and so is not
+    // eligible for parallelism.
+    AstScope* prevScopep = NULL;
+    AstCFunc* cfuncp = NULL;
+    int stmtCount = 0;
+    V3GraphVertex* vertexp = m_graph.verticesBeginp();
+    for (; vertexp; vertexp = vertexp->verticesNextp()) {
+        OrderLogicVertex* const logicp = dynamic_cast<OrderLogicVertex*>(vertexp);
+        if (!logicp) continue;  // Only logic vertices are of interest
+        const bool wantInitial = (logic_type == LOGIC_INITIAL);
+        const bool wantSettle = (logic_type == LOGIC_SETTLE);
+        if (wantInitial && !logicp->domainp()->hasInitial()) continue;
+        if (wantSettle && !logicp->domainp()->hasSettle()) continue;
+        if (prevScopep != logicp->scopep()) {
+            // Scope changed: force a fresh cfunc so none spans two scopes
+            prevScopep = logicp->scopep();
+            cfuncp = NULL;
+        }
+        AstActive* const activep = processMoveOneLogic(logicp, cfuncp/*ref*/, stmtCount/*ref*/);
+        if (activep) m_scopetopp->addActivep(activep);
+    }
+}
+
+void OrderVisitor::processMTasks() {  // Partition logic into mtasks and build the AstExecGraph
+    // For nondeterminism debug:
+    V3Partition::hashGraphDebug(&m_graph, "V3Order's m_graph");
+
+    processMTasksInitial(LOGIC_INITIAL);
+    processMTasksInitial(LOGIC_SETTLE);
+
+    // We already produced a graph of every var, input, logic, and settle
+    // block and all dependencies; this is 'm_graph'.
+    //
+    // Now, starting from m_graph, make a slightly-coarsened graph representing
+    // only logic, and discarding edges we know we can ignore.
+    // This is quite similar to the 'm_pomGraph' of the serial code gen:
+    V3Graph logicGraph;
+    OrderMTaskMoveVertexMaker create_mtask_vertex(&logicGraph);
+    ProcessMoveBuildGraph<MTaskMoveVertex> mtask_pmbg(
+        &m_graph, &logicGraph, &create_mtask_vertex);
+    mtask_pmbg.build();
+
+    // Needed? We do this for m_pomGraph in serial mode, so do it here too:
+    logicGraph.removeRedundantEdges(&V3GraphEdge::followAlwaysTrue);
+
+    // Partition logicGraph into LogicMTask's. The partitioner will annotate
+    // each vertex in logicGraph with a 'color' which is really an mtask ID
+    // in this context.
+    V3Partition partitioner(&logicGraph);
+    V3Graph mtasks;
+    partitioner.go(&mtasks);
+
+    vl_unordered_map<unsigned /*mtask id*/, MTaskState> mtaskStates;
+
+    // Iterate through the entire logicGraph. For each logic node,
+    // attach it to a per-MTask ordered list of logic nodes.
+    // This is the order we'll execute logic nodes within the MTask.
+    //
+    // MTasks may span scopes and domains, so sort by both here:
+    GraphStream<OrderVerticesByDomainThenScope> emit_logic(&logicGraph);
+    const V3GraphVertex* moveVxp;
+    while ((moveVxp = emit_logic.nextp())) {
+        const MTaskMoveVertex* movep =
+            dynamic_cast<const MTaskMoveVertex*>(moveVxp);
+        unsigned mtaskId = movep->color();
+        UASSERT(mtaskId > 0,
+                "Every MTaskMoveVertex should have an mtask assignment >0");
+        if (movep->logicp()) {
+            // Add this logic to the per-mtask order
+            mtaskStates[mtaskId].m_logics.push_back(movep->logicp());
+
+            // Since we happen to be iterating over every logic node,
+            // take this opportunity to annotate each AstVar with the ids
+            // of mtasks that consume it and produce it. We'll use this
+            // information in V3EmitC when we lay out vars in memory.
+            const OrderLogicVertex* logicp = movep->logicp();
+            for (const V3GraphEdge* edgep = logicp->inBeginp();
+                 edgep; edgep = edgep->inNextp()) {
+                const OrderVarVertex* pre_varp =
+                    dynamic_cast<const OrderVarVertex*>(edgep->fromp());
+                if (!pre_varp) continue;
+                AstVar* varp = pre_varp->varScp()->varp();
+                // Stated intent: varp depends on logicp, so logicp produces varp (vice-versa below).
+                // NOTE(review): this loop walks in-edges (var -> logic); confirm the direction.
+                varp->addProducingMTaskId(mtaskId);
+            }
+            for (const V3GraphEdge* edgep = logicp->outBeginp();
+                 edgep; edgep = edgep->outNextp()) {
+                const OrderVarVertex* post_varp
+                    = dynamic_cast<const OrderVarVertex*>(edgep->top());
+                if (!post_varp) continue;
+                AstVar* varp = post_varp->varScp()->varp();
+                varp->addConsumingMTaskId(mtaskId);
+            }
+            // TODO? We ignore IO vars here, so those will have empty mtask
+            // signatures. But we could also give those mtask signatures.
+        }
+    }
+
+    // Create the AstExecGraph node which represents the execution
+    // of the MTask graph.
+    FileLine* rootFlp = new FileLine("AstRoot", 0);
+    AstExecGraph* execGraphp = new AstExecGraph(rootFlp);
+    m_scopetopp->addActivep(execGraphp);
+    v3Global.rootp()->execGraphp(execGraphp);
+
+    // Create CFuncs and bodies for each MTask.
+    GraphStream<MTaskVxIdLessThan> emit_mtasks(&mtasks);
+    const V3GraphVertex* mtaskVxp;
+    while ((mtaskVxp = emit_mtasks.nextp())) {
+        const AbstractLogicMTask* mtaskp =
+            dynamic_cast<const AbstractLogicMTask*>(mtaskVxp);
+
+        // Create a body for this mtask
+        AstMTaskBody* bodyp = new AstMTaskBody(rootFlp);
+        MTaskState& state = mtaskStates[mtaskp->id()];
+        state.m_mtaskBodyp = bodyp;
+
+        // Create leaf CFunc's to run this mtask's logic,
+        // and create a set of AstActive's to call those CFuncs.
+        // Add the AstActive's into the AstMTaskBody.
+        const AstSenTree* last_domainp = NULL;
+        AstCFunc* leafCFuncp = NULL;
+        int leafStmts = 0;
+        for (MTaskState::Logics::iterator it = state.m_logics.begin();
+             it != state.m_logics.end(); ++it) {
+            const OrderLogicVertex* logicp = *it;
+            if (logicp->domainp() != last_domainp) {
+                // Start a new leaf function.
+                leafCFuncp = NULL;
+            }
+            last_domainp = logicp->domainp();
+
+            AstActive* newActivep = processMoveOneLogic(logicp, leafCFuncp/*ref*/, leafStmts/*ref*/);
+            if (newActivep) bodyp->addStmtsp(newActivep);
+        }
+
+        // Translate the LogicMTask graph into the corresponding ExecMTask
+        // graph, which will outlive V3Order and persist for the remainder
+        // of verilator's processing.
+        // - The LogicMTask graph points to MTaskMoveVertex's
+        //   and OrderLogicVertex's which are ephemeral to V3Order.
+        // - The ExecMTask graph and the AstMTaskBody's produced here
+        //   persist until code generation time.
+        state.m_execMTaskp =
+            new ExecMTask(execGraphp->mutableDepGraphp(),
+                          bodyp, mtaskp->id());
+        // Cross-link each ExecMTask and MTaskBody
+        //  Q: Why even have two objects?
+        //  A: One is an AstNode, the other is a GraphVertex,
+        //     to combine them would involve multiple inheritance...
+        state.m_mtaskBodyp->execMTaskp(state.m_execMTaskp);
+        for (V3GraphEdge* inp = mtaskp->inBeginp();
+             inp; inp = inp->inNextp()) {
+            const V3GraphVertex* fromVxp = inp->fromp();
+            const AbstractLogicMTask* fromp =
+                dynamic_cast<const AbstractLogicMTask*>(fromVxp);
+            MTaskState& fromState = mtaskStates[fromp->id()];
+            new V3GraphEdge(execGraphp->mutableDepGraphp(),
+                            fromState.m_execMTaskp, state.m_execMTaskp, 1);
+        }
+        execGraphp->addMTaskBody(bodyp);
+    }
+}
+
 //######################################################################
 // OrderVisitor - Top processing
 
@@ -1762,7 +2008,7 @@ void OrderVisitor::process() {
 
     if (debug() && v3Global.opt.dumpTree()) processEdgeReport();
 
-    {
+    if (!v3Global.opt.mtasks()) {
         UINFO(2,"  Construct Move Graph...\n");
         processMoveBuildGraph();
         if (debug()>=4) m_pomGraph.dumpDotFilePrefixed("ordermv_start");  // Different prefix (ordermv) as it's not the same graph
@@ -1771,6 +2017,9 @@ void OrderVisitor::process() {
 
         UINFO(2,"  Move...\n");
         processMove();
+    } else {
+        UINFO(2,"  Set up mtasks...\n");
+        processMTasks();
     }
 
     // Any SC inputs feeding a combo domain must be marked, so we can make them sc_sensitive
diff --git a/src/V3OrderGraph.h b/src/V3OrderGraph.h
index e6ec2b096..4edd1bc3d 100644
--- a/src/V3OrderGraph.h
+++ b/src/V3OrderGraph.h
@@ -21,6 +21,7 @@
 //
 //	V3GraphVertex
 //	  OrderMoveVertex
+//        MTaskMoveVertex
 //	  OrderEitherVertex
 //	    OrderInputsVertex
 //	    OrderSettleVertex
@@ -47,6 +48,7 @@
 #include "verilatedos.h"
 #include "V3Ast.h"
 #include "V3Graph.h"
+#include VL_INCLUDE_UNORDERED_MAP
 
 class OrderVisitor;
 class OrderMoveVertex;
@@ -363,6 +365,57 @@ public:
     void domScopep(OrderMoveDomScope* ds) { m_domScopep=ds; }
 };
 
+// Similar to OrderMoveVertex, but modified for threaded code generation.
+class MTaskMoveVertex : public V3GraphVertex {
+    //  This could be more compact, since we know m_varp and m_logicp
+    //  cannot both be set. Each MTaskMoveVertex represents a logic node
+    //  or a var node, it can't be both.
+    OrderLogicVertex* m_logicp;  // Logic represented by this vertex
+    const OrderEitherVertex* m_varp;  // Var represented by this vertex
+    const AstScope* m_scopep;  // Scope given at construction
+    const AstSenTree* m_domainp;  // Domain given at construction
+
+protected:
+    friend class OrderVisitor;
+    friend class MTaskMoveVertexMaker;
+public:
+    MTaskMoveVertex(V3Graph* graphp, OrderLogicVertex* logicp,
+                    const OrderEitherVertex* varp,
+                    const AstScope* scopep, const AstSenTree* domainp)
+        : V3GraphVertex(graphp), m_logicp(logicp),
+          m_varp(varp), m_scopep(scopep), m_domainp(domainp) {
+        UASSERT(!(logicp && varp),  // At most one of the two may be non-NULL
+                "MTaskMoveVertex: logicp and varp may not both be set!\n");
+    }
+    virtual ~MTaskMoveVertex() {}
+    virtual MTaskMoveVertex* clone(V3Graph* graphp) const {
+      v3fatalSrc("Unsupported"); return NULL; }  // Cloning always fatals
+    virtual OrderVEdgeType type() const { return OrderVEdgeType::VERTEX_MOVE; }
+    virtual string dotColor() const {
+        if (logicp()) return logicp()->dotColor();
+        else return "yellow";  // Vertex carries no logic
+    }
+    virtual string name() const {
+        string nm;
+        if (logicp()) {
+            nm = logicp()->name();
+            nm += (string("\\nMV:")
+                   +" d="+cvtToStr((void*)logicp()->domainp())
+                   +" s="+cvtToStr((void*)logicp()->scopep())
+                   // "color()" represents the mtask ID.
+                   +"\\nt="+cvtToStr(color()));
+        } else {
+            nm = "nolog\\nt="+cvtToStr(color());  // No logic: mtask ID only
+        }
+        return nm;
+    }
+    // ACCESSORS
+    OrderLogicVertex* logicp() const { return m_logicp; }
+    const OrderEitherVertex* varp() const { return m_varp; }
+    const AstScope* scopep() const { return m_scopep; }
+    const AstSenTree* domainp() const { return m_domainp; }
+};
+
 //######################################################################
 // Edge types
 
diff --git a/src/V3Partition.cpp b/src/V3Partition.cpp
new file mode 100644
index 000000000..ad0ec9d47
--- /dev/null
+++ b/src/V3Partition.cpp
@@ -0,0 +1,2759 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//*************************************************************************
+// DESCRIPTION: Verilator: Threading's logic to mtask partitioner
+//
+// Code available from: http://www.veripool.org/verilator
+//
+//*************************************************************************
+//
+// Copyright 2003-2018 by Wilson Snyder.  This program is free software; you can
+// redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+//
+// Verilator is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+//*************************************************************************
+
+#include "config_build.h"
+#include "verilatedos.h"
+#include <list>
+#include <memory>
+#include <cstring>
+
+#include "V3Os.h"
+#include "V3File.h"
+#include "V3GraphAlg.h"
+#include "V3GraphPathChecker.h"
+#include "V3GraphStream.h"
+#include "V3InstrCount.h"
+#include "V3Partition.h"
+#include "V3PartitionGraph.h"
+#include "V3Scoreboard.h"
+#include "V3Stats.h"
+#include VL_INCLUDE_UNORDERED_SET
+
+class MergeCandidate;
+
+//######################################################################
+// Partitioner tunable settings:
+//
+// Before describing these settings, a bit of background:
+//
+// Early during the development of the partitioner, V3Split was failing to
+// split large always blocks (with ~100K assignments) so we had to handle
+// very large vertices with ~100K incoming and outgoing edges.
+//
+// The partitioner attempts to deal with such densely connected
+// graphs. Some of the tuning parameters below reference "huge vertices",
+// that's what they're talking about, vertices with tens of thousands of
+// edges in and out. Whereas most graphs have only tens of edges in and out
+// of most vertices.
+//
+// V3Split has since been fixed to more reliably split large always
+// blocks. It's kind of an open question whether the partitioner must
+// handle huge nodes gracefully. Maybe not!  But it still can, given
+// appropriate tuning.
+
+
+//   PART_SIBLING_EDGE_LIMIT (integer)
+//
+// Arbitrarily limit the number of edges on a single vertex that will be
+// considered when enumerating siblings, to the given value.  This protects
+// the partitioner runtime in the presence of huge vertices.
+//
+// The sibling-merge is less important than the edge merge.  (You can
+// totally disable the sibling merge and get halfway decent partitions; you
+// can't disable edge merges, those are fundamental to the process.) So,
+// skipping the enumeration of some siblings on a few vertices does not
+// have a large impact on the result of the partitioner.
+//
+// If your vertices are small, the limit (at 25) approaches a no-op.  Hence
+// there's basically no cost to applying this limit even when we don't
+// expect huge vertices.
+//
+// If you don't care about partitioner runtime and you want the most
+// aggressive partition, set the limit very high.  If you have huge
+// vertices, leave this as is.
+#define PART_SIBLING_EDGE_LIMIT 25
+
+
+//   PART_STEPPED_COST (boolean)
+//
+// When computing critical path costs, use a step function on the actual
+// underlying vertex cost.
+//
+// If there are huge vertices, when a tiny vertex merges into a huge
+// vertex, we can often avoid increasing the huge vertex's stepped cost.
+// If the stepped cost hasn't increased, and the critical path into the huge
+// vertex hasn't increased, we can avoid propagating a new critical path to
+// vertices past the huge vertex. Since huge vertices tend to have huge lists
+// of children and parents, this can be a substantial savings.
+//
+// Does not seem to reduce the quality of the partitioner's output.
+//
+// If you have huge vertices, leave this 'true', it is the major setting
+// that allows the partitioner to handle such difficult graphs on anything
+// like a human time scale.
+//
+// If you don't have huge vertices, the 'true' value doesn't help much but
+// should cost almost nothing in terms of partitioner quality.
+//
+// If you want the most aggressive possible partition, set it "false" and
+// be prepared to be disappointed when the improvement in the partition is
+// negligible / in the noise.
+//
+// Q) Why retain the control, if there is really no downside?
+//
+// A) Cost stepping can lead to corner cases. A developer may wish to
+//    disable cost stepping to rule it out as the cause of unexpected
+//    behavior.
+#define PART_STEPPED_COST true
+
+
+//   PART_STEPPED_RESCORE_LIMIT (boolean)
+//
+// If false, we always try to merge the absolute lowest (best) scoring
+// mtask pair among all candidates.
+//
+// If true, we're willing to merge mtask pairs with scores up to 5% higher
+// (worse) than the best, in exchange for doing a Rescore() operation
+// somewhat less often.
+//
+// A true setting can result in a much faster compile in the presence of
+// huge vertices, eg. 45 minutes versus 4.5 minutes for one particular
+// model. HOWEVER, a true setting usually results in modestly worse
+// partitions, often around 10% more MTasks and 10% longer cycle times.
+//
+// (TODO: Why does this setting save time with huge vertices?
+// Is there a way to get best of both worlds without the trade off?)
+//
+// If you have huge vertices, you may wish to set this true.  If you don't
+// have huge vertices (which should be everyone, we think, now that V3Split
+// is fixed) leave it set false for the most aggressive partition.
+#define PART_STEPPED_RESCORE_LIMIT false
+
+
+// Don't produce more than a certain maximum number of MTasks.  This helps
+// the TSP variable sort not to blow up (a concern for some of the tests)
+// and we probably don't want a huge number of mtasks in practice anyway
+// (50 to 100 is typical.)
+//
+// If the user doesn't give one with '--threads-max-mtasks', we'll set the
+// maximum # of MTasks to
+//  (# of threads * PART_DEFAULT_MAX_MTASKS_PER_THREAD)
+#define PART_DEFAULT_MAX_MTASKS_PER_THREAD 50
+
+//   end tunables.
+
+//######################################################################
+// Misc graph and assertion utilities
+
+static void partCheckCachedScoreVsActual(uint32_t cached, uint32_t actual) {
+#if PART_STEPPED_COST
+    // Check that a cached CP score is within ~10% of the actual score.
+    // Cached CP might be a little bigger than actual, due to stepped CPs.
+    // Example: a parent with stepped_cost 40 and a grandparent with
+    // stepped_cost 27 give us forward-cp 67.  If they merge into a node
+    // with stepped cost 66, the new CP hasn't grown, so it isn't propagated
+    // and children keep the slightly-high cached value; permit that.
+    const vluint64_t cached64 = cached;  // Widen: "* 11" can wrap 32 bits
+    const vluint64_t actual64 = actual;
+    UASSERT((((cached64 * 10) <= (actual64 * 11))
+             && (cached64 * 11) >= (actual64 * 10)),
+            "Calculation error in scoring (approximate, may need tweak)");
+#else
+    UASSERT(cached == actual, "Calculation error in scoring");
+#endif
+}
+
+//######################################################################
+// PartPropagateCp
+
+// Propagate increasing critical path (CP) costs through a graph.
+//
+// Usage:
+//  * Client increases the cost and/or CP at a node or small set of nodes
+//    (often a pair in practice, eg. edge contraction.)
+//  * Client instantiates a PartPropagateCp object
+//  * Client calls PartPropagateCp::cpHasIncreased() one or more times.
+//    Each call indicates that the inclusive CP of some "seed" vertex
+//    has increased to a given value.
+//    * NOTE: PartPropagateCp will neither read nor modify the cost
+//      or CPs at the seed vertices, it only accesses and modifies
+//      vertices wayward from the seeds.
+//  * Client calls PartPropagateCp::go(). Internally, this iteratively
+//    propagates the new CPs wayward through the graph.
+//
+template <class T_CostAccessor> class PartPropagateCp : GraphAlg<> {
+private:
+    // MEMBERS
+    GraphWay m_way;  // CPs oriented in this direction: either FORWARD
+    //               // from graph-start to current node, or REVERSE
+    //               // from graph-end to current node.
+    T_CostAccessor* m_accessp;  // Access cost and CPs on V3GraphVertex's.
+    vluint64_t m_generation;  // Mark each vertex with this number;
+    //                        // confirm we only process each vertex once.
+    bool m_slowAsserts;  // Enable nontrivial asserts
+    typedef SortByValueMap<V3GraphVertex*, uint32_t> PropCpPendSet;
+    PropCpPendSet m_pending;  // Pending vertex -> amount its CP must grow
+
+public:
+    // CONSTRUCTORS
+    PartPropagateCp(V3Graph* graphp, GraphWay way, T_CostAccessor* accessp,
+                    bool slowAsserts,
+                    V3EdgeFuncP edgeFuncp = &V3GraphEdge::followAlwaysTrue)
+        : GraphAlg<>(graphp, edgeFuncp)
+        , m_way(way)
+        , m_accessp(accessp)
+        , m_generation(0)
+        , m_slowAsserts(slowAsserts) {}
+
+    // METHODS
+    void cpHasIncreased(V3GraphVertex* vxp, uint32_t newInclusiveCp) {
+        // For *vxp, whose CP-inclusive has just increased to
+        // newInclusiveCp, iterate to all wayward nodes, update the edges
+        // of each, and add each to m_pending if its overall CP has grown.
+        for (V3GraphEdge* edgep = vxp->beginp(m_way);
+             edgep; edgep = edgep->nextp(m_way)) {
+            if (!m_edgeFuncp(edgep)) continue;
+            V3GraphVertex* relativep = edgep->furtherp(m_way);
+            m_accessp->notifyEdgeCp(relativep, m_way, vxp, newInclusiveCp);
+
+            if (m_accessp->critPathCost(relativep, m_way) < newInclusiveCp) {
+                // relativep's critPathCost() is out of step with its
+                // longest !wayward edge. Schedule that to be resolved.
+                uint32_t newPendingVal =
+                    newInclusiveCp - m_accessp->critPathCost(relativep, m_way);
+                if (m_pending.has(relativep)) {
+                    if (newPendingVal > m_pending.at(relativep)) {
+                        m_pending.set(relativep, newPendingVal);
+                    }
+                } else {
+                    m_pending.set(relativep, newPendingVal);
+                }
+            }
+        }
+    }
+
+    void go() {
+        // m_pending maps each pending vertex to the amount that its wayward
+        // CP will grow.
+        //
+        // We can iterate over the pending set in reverse order, always
+        // choosing the nodes with the largest pending CP-growth.
+        //
+        // The intuition is: if the original seed node had its CP grow by
+        // 50, the most any wayward node can possibly grow is also 50.  So
+        // for anything pending to grow by 50, we know we can process it
+        // once and we won't have to grow its CP again on the current pass.
+        // After we're done with all the grow-by-50s, nothing else will
+        // grow by 50 again on the current pass, and we can process the
+        // grow-by-49s and we know we'll only have to process each one
+        // once.  And so on.
+        //
+        // This generalizes to multiple seed nodes also.
+        while (!m_pending.empty()) {
+            PropCpPendSet::reverse_iterator it = m_pending.rbegin();
+            V3GraphVertex* updateMep = (*it).key();
+            uint32_t cpGrowBy = (*it).value();
+            m_pending.erase(it);
+
+            // Bring *updateMep's stale critPathCost up to date with its edges,
+            // then propagate its new inclusive CP (CP + own cost) onward.
+            uint32_t startCp = m_accessp->critPathCost(updateMep, m_way);
+            uint32_t newCp = startCp + cpGrowBy;
+            if (m_slowAsserts) {
+                m_accessp->checkNewCpVersusEdges(updateMep, m_way, newCp);
+            }
+
+            m_accessp->setCritPathCost(updateMep, m_way, newCp);
+            cpHasIncreased(updateMep, newCp + m_accessp->cost(updateMep));
+        }
+    }
+
+private:
+    VL_DEBUG_FUNC;
+    VL_UNCOPYABLE(PartPropagateCp);
+};
+
+class PartPropagateCpSelfTest {  // Self-test of PartPropagateCp
+private:
+    // MEMBERS
+    V3Graph m_graph;  // A graph
+    V3GraphVertex* m_vx[50];  // All vertices within the graph
+    typedef vl_unordered_map<V3GraphVertex*, uint32_t> CpMap;
+    CpMap m_cp;  // Vertex-to-CP map
+    CpMap m_seen;  // Vertices whose CP was set since the last m_seen.clear()
+
+    // CONSTRUCTORS
+    PartPropagateCpSelfTest() {}
+    ~PartPropagateCpSelfTest() {}
+
+    // METHODS
+protected:
+    friend class PartPropagateCp<PartPropagateCpSelfTest>;
+    void notifyEdgeCp(V3GraphVertex* vxp, GraphWay way,
+                      V3GraphVertex* throughp, uint32_t cp) const {
+        uint32_t throughCost = critPathCost(throughp, way);
+        UASSERT_SELFTEST(uint32_t, cp, (1 + throughCost));  // Every cost() is 1
+    }
+private:
+    void checkNewCpVersusEdges(V3GraphVertex* vxp,
+                               GraphWay way, uint32_t cp) const {
+        // Don't need to check this in the self test; it supports an assert
+        // that runs in production code.
+    }
+    void setCritPathCost(V3GraphVertex* vxp,
+                         GraphWay way, uint32_t cost) {
+        m_cp[vxp] = cost;
+        // Confirm that we only set each node's CP once.  That's an
+        // important property of PartPropagateCp which allows it to be far
+        // faster than a recursive algorithm on some graphs.
+        CpMap::iterator it = m_seen.find(vxp);
+        if (it != m_seen.end()) vxp->v3fatalSrc("Set CP on node twice");
+        m_seen[vxp] = cost;
+    }
+    uint32_t critPathCost(V3GraphVertex* vxp, GraphWay way) const {
+        CpMap::const_iterator it = m_cp.find(vxp);
+        if (it != m_cp.end()) return it->second;
+        return 0;  // Vertices with no stored CP read as 0
+    }
+    uint32_t cost(const V3GraphVertex*) const { return 1; }
+    void partInitCriticalPaths(bool checkOnly) {  // checkOnly: verify, don't set
+        // Set up the FORWARD cp's only.  This test only looks in one
+        // direction, it assumes REVERSE is symmetrical and would be
+        // redundant to test.
+        GraphStreamUnordered order(&m_graph);
+        while (const V3GraphVertex* cvxp = order.nextp()) {
+            V3GraphVertex* vxp = const_cast<V3GraphVertex*>(cvxp);
+            uint32_t cpCost = 0;
+            for (V3GraphEdge* edgep = vxp->inBeginp();
+                 edgep; edgep = edgep->inNextp()) {
+                V3GraphVertex* parentp = edgep->fromp();
+                cpCost = std::max(cpCost,
+                                  critPathCost(parentp, GraphWay::FORWARD) + 1);
+            }
+            if (checkOnly) {
+                UASSERT_SELFTEST(uint32_t, cpCost,
+                                 critPathCost(vxp, GraphWay::FORWARD));
+            } else {
+                setCritPathCost(vxp, GraphWay::FORWARD, cpCost);
+            }
+        }
+    }
+    void go() {
+        // Generate a pseudo-random graph
+        uint16_t rngState[3] = { 0xdead, 0xbeef, 0xf000 };
+        // Create 50 vertices
+        for (unsigned i = 0; i < 50; ++i) {
+            m_vx[i] = new V3GraphVertex(&m_graph);
+        }
+        // Create up to 250 random edges (equal-index picks are skipped). Edges
+        // go from lower-to-higher index vertices, so we get a DAG.
+        for (unsigned i = 0; i < 250; ++i) {
+            unsigned idx1 = nrand48(rngState) % 50;
+            unsigned idx2 = nrand48(rngState) % 50;
+            if (idx1 > idx2) {
+                new V3GraphEdge(&m_graph, m_vx[idx2], m_vx[idx1], 1);
+            } else if (idx2 > idx1) {
+                new V3GraphEdge(&m_graph, m_vx[idx1], m_vx[idx2], 1);
+            }
+        }
+
+        partInitCriticalPaths(false);
+
+        // This SelfTest class is also the T_CostAccessor
+        PartPropagateCp<PartPropagateCpSelfTest>
+            prop(&m_graph, GraphWay::FORWARD, this, true);
+
+        // Seed the propagator with every input node;
+        // This should result in the complete graph getting all CP's assigned.
+        for (unsigned i = 0; i < 50; ++i) {
+            if (!m_vx[i]->inBeginp()) {
+                prop.cpHasIncreased(m_vx[i], 1 /* inclusive CP starts at 1 */);
+            }
+        }
+        // NOTE(review): CPs are already set above; confirm go() has work to do.
+        // Run the propagator.
+        //  * The setCritPathCost() routine checks that each node's CP changes
+        //    at most once.
+        //  * The notifyEdgeCp routine is also self checking.
+        m_seen.clear();
+        prop.go();
+
+        // Finally, confirm that the entire graph appears to have correct CPs.
+        partInitCriticalPaths(true);
+    }
+public:
+    static void selfTest() {
+        PartPropagateCpSelfTest().go();
+    }
+};
+
+//######################################################################
+// LogicMTask
+
+class LogicMTask : public AbstractLogicMTask {
+public:
+    // TYPES
+    typedef std::list<MTaskMoveVertex*> VxList;
+
+    struct CmpLogicMTask {  // Strict ordering of LogicMTasks by ascending id()
+        bool operator() (const LogicMTask* ap, const LogicMTask* bp) const {
+            return ap->id() < bp->id();
+        }
+    };
+
+    // This adaptor class allows the PartPropagateCp class to be somewhat
+    // independent of the LogicMTask class
+    //  - PartPropagateCp can thus be declared before LogicMTask
+    //  - PartPropagateCp could be reused with graphs of other node types
+    //    in the future, using another Accessor adaptor.
+    class CpCostAccessor {
+    public:
+        CpCostAccessor() {}
+        ~CpCostAccessor() {}
+        // Return cost of this node
+        uint32_t cost(const V3GraphVertex* vxp) const {
+            const LogicMTask* mtaskp = dynamic_cast<const LogicMTask*>(vxp);
+            return mtaskp->stepCost();
+        }
+        // Return stored CP to this node
+        uint32_t critPathCost(const V3GraphVertex* vxp, GraphWay way) const {
+            const LogicMTask* mtaskp = dynamic_cast<const LogicMTask*>(vxp);
+            return mtaskp->critPathCost(way);
+        }
+        // Store a new CP to this node
+        void setCritPathCost(V3GraphVertex* vxp,
+                             GraphWay way, uint32_t cost) const {
+            LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(vxp);
+            mtaskp->setCritPathCost(way, cost);
+        }
+        // Notify vxp that the wayward CP at the throughp-->vxp edge
+        // has increased to 'cp'. (vxp is wayward from throughp.)
+        // This is our cue to update vxp's m_edges[!way][throughp].
+        void notifyEdgeCp(V3GraphVertex* vxp, GraphWay way,
+                          V3GraphVertex* throughp, uint32_t cp) const {
+            LogicMTask* updateVxp = dynamic_cast<LogicMTask*>(vxp);
+            LogicMTask* throughMTaskp = dynamic_cast<LogicMTask*>(throughp);
+            EdgeSet& edges = updateVxp->m_edges[way.invert()];
+            uint32_t edgeCp = edges.at(throughMTaskp);
+            if (cp > edgeCp) edges.set(throughMTaskp, cp);  // Only ever raise it
+        }
+        // Check that CP matches that of the longest edge !wayward of vxp.
+        void checkNewCpVersusEdges(V3GraphVertex* vxp,
+                                   GraphWay way, uint32_t cp) const {
+            LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(vxp);
+            EdgeSet& edges = mtaskp->m_edges[way.invert()];
+            // This is mtaskp's relative with longest !wayward inclusive CP:
+            EdgeSet::reverse_iterator edgeIt = edges.rbegin();
+            uint32_t edgeCp = (*edgeIt).value();
+            if (edgeCp != cp) vxp->v3fatalSrc("CP doesn't match longest wayward edge");
+        }
+    private:
+        VL_UNCOPYABLE(CpCostAccessor);
+    };
+
+private:
+    // MEMBERS
+
+    // Set of MTaskMoveVertex's assigned to this mtask. LogicMTask does not
+    // own the MTaskMoveVertex objects, we merely keep pointers to them
+    // here.
+    VxList m_vertices;
+
+    // Cost estimate for this LogicMTask, derived from V3InstrCount.
+    // In abstract time units.
+    uint32_t m_cost;
+
+    // Cost of critical paths going FORWARD from graph-start to the start
+    // of this vertex, and also going REVERSE from the end of the graph to
+    // the end of the vertex. Same units as m_cost.
+    uint32_t m_critPathCost[GraphWay::NUM_WAYS];
+
+    uint32_t m_serialId;  // Unique MTask ID number
+
+    // Count "generations" which are just operations that scan through the
+    // graph. We'll mark each node with the last generation that scanned
+    // it. We can use this to avoid recursing through the same node twice
+    // while searching for a path.
+    vluint64_t m_generation;
+
+    // Redundant with the V3GraphEdge's, store a map of relatives so we can
+    // quickly check if we have a given parent or child.
+    //
+    // 'm_edges[way]' maps a wayward relative to the !way critical path at
+    // our edge with them. The SortByValueMap supports iterating over
+    // relatives in longest-to-shortest CP order.  We rely on this ordering
+    // in more than one place.
+    typedef SortByValueMap<LogicMTask*, uint32_t, CmpLogicMTask> EdgeSet;
+    EdgeSet m_edges[GraphWay::NUM_WAYS];
+
+public:
+    // CONSTRUCTORS
+    LogicMTask(V3Graph* graphp, MTaskMoveVertex* mtmvVxp)
+        : AbstractLogicMTask(graphp)
+        , m_cost(0)
+        , m_generation(0) {
+        for (int i=0; i<GraphWay::NUM_WAYS; ++i) m_critPathCost[i] = 0;
+        if (mtmvVxp) {  // Else null for test
+            m_vertices.push_back(mtmvVxp);
+            if (OrderLogicVertex* olvp = mtmvVxp->logicp()) {
+                m_cost += V3InstrCount::count(olvp->nodep(), true);
+            }  // A vertex without logic leaves m_cost at 0
+        }
+        // Start at 1, so that 0 indicates no mtask ID.
+        static uint32_t s_nextId = 1;  // Function-static: shared by all instances
+        m_serialId = s_nextId++;
+        UASSERT(s_nextId < 0xFFFFFFFFUL, "Too many mtasks");
+    }
+
+    // METHODS
+    void moveAllVerticesFrom(LogicMTask* otherp) {  // Absorb otherp's vertices
+        // splice() is constant time
+        m_vertices.splice(m_vertices.end(), otherp->m_vertices);
+        m_cost += otherp->m_cost;  // Note otherp->m_cost itself is not cleared
+    }
+    virtual const VxList* vertexListp() const {
+        return &m_vertices;
+    }
+    static vluint64_t incGeneration() {  // Returns a new generation number
+        static vluint64_t s_generation = 0;
+        ++s_generation;  // Pre-increment, so the first value returned is 1
+        return s_generation;
+    }
+
+    // Use this instead of pointer-compares to compare LogicMTasks. Avoids
+    // nondeterministic output.  Also name mtasks based on this number in
+    // the final C++ output.
+    virtual uint32_t id() const { return m_serialId; }
+    void id(uint32_t id) { m_serialId = id; }
+    // Abstract cost of every logic mtask
+    virtual uint32_t cost() const { return m_cost; }
+    void setCost(uint32_t cost) { m_cost = cost; }  // For tests only
+    uint32_t stepCost() const { return stepCost(m_cost); }
+    static uint32_t stepCost(uint32_t cost) {
+#if PART_STEPPED_COST
+        // Return 'cost' rounded up to the next ~5% step (0 stays 0).  Use
+        // this when computing all critical paths. The idea is that critical
+        // path changes don't need to propagate when they don't exceed the
+        // next step, saving a lot of recursion.
+        if (cost == 0) return 0;
+        double logcost = log(cost);
+        // log(1.05) is about 0.05
+        // So, round logcost up to the next 0.05 boundary
+        logcost *= 20.0;
+        logcost = ceil(logcost);
+        logcost = logcost / 20.0;
+        uint32_t stepCost = (uint32_t)(exp(logcost));
+        UASSERT_STATIC(stepCost >= cost, "stepped cost error exceeded");
+        // Widen: "cost * 11" wraps in 32 bits for costs above ~390M.
+        UASSERT_STATIC(stepCost <= ((static_cast<vluint64_t>(cost) * 11) / 10),
+                       "stepped cost error exceeded");
+        return stepCost;
+#else
+        return cost;
+#endif
+    }
+
+    void addRelative(GraphWay way, LogicMTask* relativep) {  // Must be new
+        EdgeSet& edges = m_edges[way];
+        UASSERT(!edges.has(relativep), "Adding existing edge");
+        // Value is relativep's inclusive !way CP: its step cost + its !way CP
+        edges.set(relativep,
+                  relativep->stepCost()
+                  + relativep->critPathCost(way.invert()));
+    }
+    void removeRelative(GraphWay way, LogicMTask* relativep) {
+        EdgeSet& edges = m_edges[way];
+        edges.erase(relativep);  // Drop relativep from this direction's map
+    }
+    bool hasRelative(GraphWay way, LogicMTask* relativep) {
+        EdgeSet& edges = m_edges[way];
+        return edges.has(relativep);  // True if relativep is recorded for 'way'
+    }
+    void checkRelativesCp(GraphWay way) const {  // Audit cached edge CPs
+        const EdgeSet& edges = m_edges[way];
+        for (EdgeSet::const_reverse_iterator it = edges.rbegin();
+             it != edges.rend(); ++it) {
+            LogicMTask* relativep = (*it).key();
+            uint32_t cachedCp = (*it).value();
+            partCheckCachedScoreVsActual  // Cached vs. freshly recomputed value
+                (cachedCp,
+                 relativep->critPathCost(way.invert()) + relativep->stepCost());
+        }
+    }
+
+    virtual string name() const {
+        // Display forward and reverse critical path costs. This gives a quick
+        // read on whether graph partitioning looks reasonable or bad.
+        std::ostringstream out;
+        out <<"mt"<<m_serialId<<"."<<this
+            <<" [b"<<m_critPathCost[GraphWay::FORWARD]
+            <<" a"<<m_critPathCost[GraphWay::REVERSE]
+            <<" c"<<cost();  // NOTE(review): '[' is never closed - intended?
+        return out.str();
+    }
+
+    void setCritPathCost(GraphWay way, uint32_t cost) { m_critPathCost[way] = cost; }
+    uint32_t critPathCost(GraphWay way) const { return m_critPathCost[way]; }
+    uint32_t critPathCostWithout(GraphWay way,
+                                 const V3GraphEdge* withoutp) const {
+        // Compute the critical path cost wayward to this node, without
+        // considering edge 'withoutp'
+        UASSERT(this == withoutp->furtherp(way),
+                "In critPathCostWithout(), edge 'withoutp' must "
+                "further to 'this'");
+
+        // Iterate through edges until we get a relative other than
+        // withoutp->furtherp(way.invert()). This should take 2 iterations max.
+        const EdgeSet& edges = m_edges[way.invert()];
+        uint32_t result = 0;  // Returned as-is when no other relative exists
+        for (EdgeSet::const_reverse_iterator it = edges.rbegin();
+             it != edges.rend(); ++it) {
+            if ((*it).key() != withoutp->furtherp(way.invert())) {
+                // Use the cached cost. It could be a small overestimate
+                // due to stepping. This is consistent with critPathCost()
+                // which also returns the cached cost.
+                result = (*it).value();
+                break;
+            }
+        }
+        return result;
+    }
+
+private:
+    static bool pathExistsFromInternal(LogicMTask* fromp,
+                                       LogicMTask* top,
+                                       const V3GraphEdge* excludedEdgep,
+                                       vluint64_t generation) {
+        // Q) Why does this take LogicMTask instead of generic V3GraphVertex?
+        // A) We'll use the critical paths known to LogicMTask to prune the
+        //    recursion for speed. Also store 'generation' in
+        //    LogicMTask::m_generation so we can prune the search and avoid
+        //    recursing through the same node more than once in a single
+        //    search.
+
+        if (fromp->m_generation == generation) {
+            // Already looked at this node in the current search.
+            // Since we're back again, we must not have found a path on the
+            // first go.
+            return false;
+        }
+        fromp->m_generation = generation;  // Mark as visited for this search
+
+        // Base case: we found a path.
+        if (fromp == top) return true;
+
+        // Base case: fromp is too late, cannot possibly be a prereq for top.
+        if (fromp->critPathCost(GraphWay::REVERSE)
+            < (top->critPathCost(GraphWay::REVERSE) + top->stepCost())) return false;
+        if ((fromp->critPathCost(GraphWay::FORWARD) + fromp->stepCost())
+            > top->critPathCost(GraphWay::FORWARD)) return false;
+
+        // Recursively look for a path
+        for (const V3GraphEdge* followp = fromp->outBeginp();
+             followp; followp = followp->outNextp()) {
+            if (followp == excludedEdgep) continue;
+            LogicMTask* nextp = dynamic_cast<LogicMTask*>(followp->top());
+            if (pathExistsFromInternal(nextp, top, NULL, generation))  // No exclusion below the first level
+                return true;
+        }
+        return false;
+    }
+
+    // True if there's a path from 'fromp' to 'top' excluding
+    // 'excludedEdgep', false otherwise.
+    //
+    // 'excludedEdgep' may be NULL in which case no edge is excluded.  If
+    // 'excludedEdgep' is non-NULL it must connect fromp and top.
+    //
+    // TODO: consider changing this API to the 'isTransitiveEdge' API
+    // used by GraphPathChecker
+public:
+    static bool pathExistsFrom(LogicMTask* fromp,
+                               LogicMTask* top,
+                               const V3GraphEdge* excludedEdgep) {
+        const vluint64_t searchGen = incGeneration();  // Stamp for this search's visited marks
+        return pathExistsFromInternal(fromp, top, excludedEdgep, searchGen);
+    }
+
+    static void dumpCpFilePrefixed(const V3Graph* graphp,
+                                   const string& nameComment) {
+        // Debug aid: write the mtasks along the critical path, and the logic
+        // each one contains, to a ".txt" debug file.
+        string filename = v3Global.debugFilename(nameComment)+".txt";
+        UINFO(1,"Writing "<<filename<<endl);
+        vl_unique_ptr<std::ofstream> ofp(V3File::new_ofstream(filename));
+        std::ostream* osp = &(*ofp);  // &* needed to deref unique_ptr
+        if (osp->fail()) v3fatalStatic("Can't write "<<filename);
+
+        // Start from the vertex with the greatest cost() plus REVERSE
+        // critical path cost; when several tie, the first seen is kept.
+        const LogicMTask* startp = NULL;
+        for (const V3GraphVertex* vxp = graphp->verticesBeginp();
+             vxp; vxp = vxp->verticesNextp()) {
+            const LogicMTask* candp = dynamic_cast<const LogicMTask*>(vxp);
+            if (!startp
+                || (candp->cost() + candp->critPathCost(GraphWay::REVERSE)
+                    > startp->cost() + startp->critPathCost(GraphWay::REVERSE))) {
+                startp = candp;
+            }
+        }
+
+        // Follow the entire critical path: from each mtask, step to the key
+        // of the last entry in its FORWARD edge set, until there is none.
+        std::vector<const LogicMTask*> path;
+        uint32_t totalCost = 0;
+        const LogicMTask* walkp = startp;
+        while (walkp) {
+            path.push_back(walkp);
+            totalCost += walkp->cost();
+            const EdgeSet& children = walkp->m_edges[GraphWay::FORWARD];
+            EdgeSet::const_reverse_iterator it = children.rbegin();
+            if (it != children.rend()) walkp = (*it).key();
+            else walkp = NULL;
+        }
+
+        *osp<<"totalCost = "<<totalCost
+            <<" (should match the computed critical path cost (CP) for the graph)\n";
+
+        // Dump
+        for (size_t i = 0; i < path.size(); ++i) {
+            const LogicMTask* mtaskp = path[i];
+            *osp<<"begin mtask with cost "<<mtaskp->cost()<<endl;
+            for (VxList::const_iterator lit = mtaskp->vertexListp()->begin();
+                 lit != mtaskp->vertexListp()->end(); ++lit) {
+                const OrderLogicVertex* logicp = (*lit)->logicp();
+                if (!logicp) continue;
+                if (0) {
+                    // Show nodes only
+                    *osp<<"> "; logicp->nodep()->dumpTree(*osp);
+                } else {
+                    // Show nodes with hierarchical costs
+                    V3InstrCount::count(logicp->nodep(), false, osp);
+                }
+            }
+        }
+    }
+
+private:
+    VL_DEBUG_FUNC;  // Declare debug()
+    VL_UNCOPYABLE(LogicMTask);
+};
+
+//######################################################################
+// MTask utility classes
+
+// Comparator giving AbstractMTask pointers a deterministic order: tasks
+// compare by id(), which is a unique and stable serial number.
+class MTaskIdLessThan {
+public:
+    MTaskIdLessThan() {}
+    virtual ~MTaskIdLessThan() {}
+    virtual bool operator() (const AbstractMTask* lhsp,
+                             const AbstractMTask* rhsp) const {
+        return rhsp->id() > lhsp->id();  // i.e. lhsp's id is the smaller
+    }
+};
+
+// Information associated with scoreboarding an MTask
+class MergeCandidate {
+private:
+    bool m_removedFromSb;  // Not on scoreboard, generally ignore
+    vluint64_t m_id;  // Serial number for ordering
+public:
+    // CONSTRUCTORS
+    MergeCandidate() : m_removedFromSb(false) {
+        static vluint64_t serial = 0;
+        m_id = ++serial;  // Candidates are numbered 1, 2, 3... as constructed
+    }
+    // Polymorphic base class, so give it a virtual destructor
+    virtual ~MergeCandidate() {}
+    virtual bool mergeWouldCreateCycle() const = 0;
+    // METHODS
+    bool removedFromSb() const { return m_removedFromSb; }
+    void removedFromSb(bool removed) { m_removedFromSb = removed; }
+    // Orders candidates by their serial number
+    bool operator<(const MergeCandidate& other) const { return m_id < other.m_id; }
+};
+
+// A pair of associated LogicMTask's that are merge candidates for sibling
+// contraction
+class SiblingMC : public MergeCandidate {
+private:
+    LogicMTask* m_ap;
+    LogicMTask* m_bp;
+    // CONSTRUCTORS
+    SiblingMC() VL_EQ_DELETE;
+public:
+    SiblingMC(LogicMTask* ap, LogicMTask* bp) {
+        // Store the pair in canonical order (m_ap gets the mtask with the
+        // greater id), so pairs of SiblingMCs are easier to compare
+        const bool apIsGreater = ap->id() > bp->id();
+        m_ap = apIsGreater ? ap : bp;
+        m_bp = apIsGreater ? bp : ap;
+    }
+    virtual ~SiblingMC() {}
+    // METHODS
+    LogicMTask* ap() const { return m_ap; }
+    LogicMTask* bp() const { return m_bp; }
+    bool mergeWouldCreateCycle() const {
+        // A path between the two mtasks, in either direction, means that
+        // merging them would create a cycle
+        if (LogicMTask::pathExistsFrom(m_ap, m_bp, NULL)) return true;
+        return LogicMTask::pathExistsFrom(m_bp, m_ap, NULL);
+    }
+    bool operator<(const SiblingMC& other) const {
+        // Compare the m_ap ids first; the m_bp ids break ties
+        if (m_ap->id() != other.m_ap->id()) {
+            return m_ap->id() < other.m_ap->id();
+        }
+        return m_bp->id() < other.m_bp->id();
+    }
+};
+
+// GraphEdge for the MTask graph; each edge is also a MergeCandidate
+class MTaskEdge : public V3GraphEdge, public MergeCandidate {
+public:
+    // CONSTRUCTORS
+    MTaskEdge(V3Graph* graphp, LogicMTask* fromp, LogicMTask* top, int weight)
+        : V3GraphEdge(graphp, fromp, top, weight),
+          MergeCandidate() {
+        // Record each endpoint as a relative of the other
+        fromp->addRelative(GraphWay::FORWARD, top);
+        top->addRelative(GraphWay::REVERSE, fromp);
+    }
+    virtual ~MTaskEdge() {
+        LogicMTask* const sourcep = fromMTaskp();
+        LogicMTask* const sinkp = toMTaskp();
+        sourcep->removeRelative(GraphWay::FORWARD, sinkp);
+        sinkp->removeRelative(GraphWay::REVERSE, sourcep);
+    }
+    // METHODS
+    LogicMTask* furtherMTaskp(GraphWay way) const {
+        return dynamic_cast<LogicMTask*>(this->furtherp(way));
+    }
+    LogicMTask* fromMTaskp() const { return dynamic_cast<LogicMTask*>(fromp()); }
+    LogicMTask* toMTaskp() const { return dynamic_cast<LogicMTask*>(top()); }
+    virtual bool mergeWouldCreateCycle() const {
+        return LogicMTask::pathExistsFrom(fromMTaskp(), toMTaskp(), this);
+    }
+    static MTaskEdge* cast(V3GraphEdge* edgep) {
+        MTaskEdge* resultp = NULL;  // A NULL edgep is passed straight through
+        if (edgep) {
+            resultp = dynamic_cast<MTaskEdge*>(edgep);
+            UASSERT(resultp, "Failed to cast in MTaskEdge::cast");
+        }
+        return resultp;
+    }
+    // Following initial assignment of critical paths, drop this edge from the
+    // relative records of both endpoints and re-add it with the updated paths.
+    void resetCriticalPaths() {
+        LogicMTask* const sourcep = fromMTaskp();
+        LogicMTask* const sinkp = toMTaskp();
+        sourcep->removeRelative(GraphWay::FORWARD, sinkp);
+        sinkp->removeRelative(GraphWay::REVERSE, sourcep);
+        sourcep->addRelative(GraphWay::FORWARD, sinkp);
+        sinkp->addRelative(GraphWay::REVERSE, sourcep);
+    }
+private:
+    VL_UNCOPYABLE(MTaskEdge);
+};
+
+//######################################################################
+// Vertex utility classes
+
+class OrderByPtrId {
+    PartPtrIdMap m_ids;  // Supplies the id looked up for each vertex pointer
+public:
+    virtual bool operator() (const OrderVarStdVertex* lhsp,
+                             const OrderVarStdVertex* rhsp) const {
+        const vluint64_t leftId = m_ids.findId(lhsp);  // Left looked up first
+        const vluint64_t rightId = m_ids.findId(rhsp);
+        return rightId > leftId;
+    }
+};
+
+//######################################################################
+// PartParallelismEst - Estimate parallelism of graph
+
+class PartParallelismEst {
+    // MEMBERS
+    const V3Graph* m_graphp;  // Mtask-containing graph
+
+    // Sum of the cost of every vertex in the graph.
+    // Dividing this by m_longestCpCost estimates how parallelizable the
+    // graph is; the estimate is only as good as the guess returned by
+    // LogicMTask::cost().
+    uint32_t m_totalGraphCost;
+
+    // Cost of the longest critical path, in abstract units (the same units
+    // returned by the vertexCost)
+    uint32_t m_longestCpCost;
+
+    size_t m_vertexCount;  // Number of vertexes calculated
+    size_t m_edgeCount;  // Number of edges calculated
+
+public:
+    // CONSTRUCTORS
+    explicit PartParallelismEst(const V3Graph* graphp)
+        : m_graphp(graphp),
+          m_totalGraphCost(0),
+          m_longestCpCost(0),
+          m_vertexCount(0),
+          m_edgeCount(0) {}
+
+    // METHODS
+    uint32_t totalGraphCost() const { return m_totalGraphCost; }
+    uint32_t longestCritPathCost() const { return m_longestCpCost; }
+    size_t vertexCount() const { return m_vertexCount; }
+    size_t edgeCount() const { return m_edgeCount; }
+    double parallelismFactor() const {
+        const double totalCost = static_cast<double>(m_totalGraphCost);
+        return totalCost / m_longestCpCost;
+    }
+    // Walk the whole graph, accumulating the costs and counts above
+    void traverse() {
+        // Critical path cost from the start of the graph through the end
+        // of each vertex processed so far.
+        vl_unordered_map<const V3GraphVertex*, uint32_t> critPaths;
+        GraphStreamUnordered serialize(m_graphp);
+        for (const V3GraphVertex* vertexp = serialize.nextp(); vertexp;
+             vertexp = serialize.nextp()) {
+            ++m_vertexCount;
+            // The critical path to the start of this vertex is the largest
+            // critical path recorded for any of its upstream vertices.
+            uint32_t cpCostToHere = 0;
+            for (V3GraphEdge* edgep = vertexp->inBeginp(); edgep;
+                 edgep = edgep->inNextp()) {
+                ++m_edgeCount;
+                const uint32_t upstreamCp = critPaths[edgep->fromp()];
+                if (upstreamCp > cpCostToHere) cpCostToHere = upstreamCp;
+            }
+            // Add the cost of the current vertex, so the value represents
+            // the critical path to the end of this vertex.
+            const uint32_t selfCost = vertexCost(vertexp);
+            cpCostToHere += selfCost;
+            critPaths[vertexp] = cpCostToHere;
+            if (cpCostToHere > m_longestCpCost) m_longestCpCost = cpCostToHere;
+            // Tally the total cost contributed by vertices.
+            m_totalGraphCost += selfCost;
+        }
+    }
+    void statsReport(const string& stage) {
+        // Every stat name begins "MTask graph, <stage>, "
+        const string prefix = "MTask graph, "+stage+", ";
+        V3Stats::addStat(prefix+"critical path cost", m_longestCpCost);
+        V3Stats::addStat(prefix+"total graph cost", m_totalGraphCost);
+        V3Stats::addStat(prefix+"mtask count", m_vertexCount);
+        V3Stats::addStat(prefix+"edge count", m_edgeCount);
+        V3Stats::addStat(prefix+"parallelism factor", parallelismFactor());
+    }
+    void debugReport() {
+        // The same figures as statsReport(), through UINFO at level 0
+        UINFO(0, "    Critical path cost = "<<m_longestCpCost<<endl);
+        UINFO(0, "    Total graph cost = "<<m_totalGraphCost<<endl);
+        UINFO(0, "    MTask vertex count = "<<m_vertexCount<<endl);
+        UINFO(0, "    Edge count = "<<m_edgeCount<<endl);
+        UINFO(0, "    Parallelism factor = "<<parallelismFactor()<<endl);
+    }
+    static uint32_t vertexCost(const V3GraphVertex* vertexp) {
+        const AbstractMTask* mtaskp = dynamic_cast<const AbstractMTask*>(vertexp);
+        return mtaskp->cost();
+    }
+
+private:
+    VL_DEBUG_FUNC;  // Declare debug()
+    VL_UNCOPYABLE(PartParallelismEst);
+};
+
+//######################################################################
+
+// Form the 'way' critical path of each vertex from its relatives' costs.
+// With 'checkOnly' the cached cost is verified rather than overwritten.
+static void partInitHalfCriticalPaths(GraphWay way, V3Graph* mtasksp, bool checkOnly) {
+    GraphStreamUnordered order(mtasksp, way);
+    GraphWay rev = way.invert();
+    for (const V3GraphVertex* vertexp = order.nextp(); vertexp;
+         vertexp = order.nextp()) {
+        LogicMTask* mtaskp
+            = const_cast<LogicMTask*>(dynamic_cast<const LogicMTask*>(vertexp));
+        uint32_t cpCost = 0;
+        vl_unordered_set<V3GraphVertex*> relatives;
+        for (V3GraphEdge* edgep = vertexp->beginp(rev);
+             edgep; edgep = edgep->nextp(rev)) {
+            V3GraphVertex* const furtherp = edgep->furtherp(rev);
+            // Run a few asserts on the initial mtask graph on the way through
+            if (edgep->weight() == 0) {
+                mtaskp->v3fatalSrc("Should be no cut edges in mtasks graph");
+            }
+            if (relatives.find(furtherp) != relatives.end()) {
+                mtaskp->v3fatalSrc("Should be no redundant edges in mtasks graph");
+            }
+            relatives.insert(furtherp);
+
+            // Keep the largest critical path arriving through any relative
+            LogicMTask* relativep = dynamic_cast<LogicMTask*>(furtherp);
+            const uint32_t viaRelative = (relativep->critPathCost(way)
+                                          + static_cast<uint32_t>(relativep->stepCost()));
+            if (viaRelative > cpCost) cpCost = viaRelative;
+        }
+        if (!checkOnly) {
+            mtaskp->setCritPathCost(way, cpCost);
+        } else {
+            partCheckCachedScoreVsActual(mtaskp->critPathCost(way), cpCost);
+        }
+    }
+}
+
+// Look at vertex costs to form critical paths for each vertex.
+static void partInitCriticalPaths(V3Graph* mtasksp) {
+    partInitHalfCriticalPaths(GraphWay::FORWARD, mtasksp, false);
+    partInitHalfCriticalPaths(GraphWay::REVERSE, mtasksp, false);
+
+    // Reset all MTaskEdges so that 'm_edges' will show correct CP numbers.
+    // They would have been all zeroes on initial creation of the MTaskEdges.
+    for (V3GraphVertex* vxp = mtasksp->verticesBeginp();
+         vxp; vxp = vxp->verticesNextp()) {
+        for (V3GraphEdge* edgep = vxp->outBeginp();
+             edgep; edgep = edgep->outNextp()) {
+            // MTaskEdge::cast() asserts that the edge really is an
+            // MTaskEdge, rather than letting a failed cast be dereferenced.
+            MTaskEdge::cast(edgep)->resetCriticalPaths();
+        }
+    }
+}
+
+// Do an EXPENSIVE check that all incremental CP updates have gone correctly.
+static void partCheckCriticalPaths(V3Graph* mtasksp) {
+    partInitHalfCriticalPaths(GraphWay::FORWARD, mtasksp, true);
+    partInitHalfCriticalPaths(GraphWay::REVERSE, mtasksp, true);
+    V3GraphVertex* vxp = mtasksp->verticesBeginp();
+    while (vxp) {
+        LogicMTask* const mtaskp = dynamic_cast<LogicMTask*>(vxp);
+        mtaskp->checkRelativesCp(GraphWay::FORWARD);
+        mtaskp->checkRelativesCp(GraphWay::REVERSE);
+        vxp = vxp->verticesNextp();
+    }
+}
+
+// Delete 'edgep', returning the edge after it in direction 'way'
+static V3GraphEdge* partBlastEdgep(GraphWay way, V3GraphEdge* edgep) {
+    V3GraphEdge* const followingp = edgep->nextp(way);  // Fetch before deleting
+    edgep->unlinkDelete(); VL_DANGLING(edgep);
+    return followingp;
+}
+
+// Merge edges from a LogicMTask.
+//
+// This code removes 'hasRelative' edges. When this occurs, mark it in need
+// of a rescore, in case its score has fallen and we need to move it up
+// toward the front of the scoreboard.
+//
+// Wait, whaaat? Shouldn't the scores only increase as we merge nodes? Well
+// that's almost true. But there is one exception.
+//
+// Suppose we have A->B, B->C, and A->C.
+//
+// The A->C edge is a "transitive" edge. It's ineligible to be merged, as
+// the merge would create a cycle. We score it on the scoreboard like any
+// other edge.
+//
+// However, our "score" estimate for A->C is bogus, because the forward
+// critical path to C and the reverse critical path to A both contain the
+// same node (B) so we overestimate the score of A->C. At first this
+// doesn't matter, since transitive edges aren't eligible to merge anyway.
+//
+// Later, suppose the edge contractor decides to merge the B->C edge, with
+// B donating all its incoming edges into C, say.  (So we reach this
+// function.)
+//
+// With B going away, the A->C edge will no longer be transitive and it
+// will become eligible to merge. But if we don't mark it for rescore,
+// it'll stay in the scoreboard with its old (overestimate) score. We'll
+// merge it too late due to the bogus score. When we finally merge it, we
+// fail the assert in the main edge contraction loop which checks that the
+// actual score did not fall below the scoreboard's score.
+//
+// Another way of stating this: this code ensures that scores of
+// non-transitive edges only ever increase.
+static void partMergeEdgesFrom(V3Graph* mtasksp, LogicMTask* recipientp,
+                               LogicMTask* donorp,
+                               V3Scoreboard<MergeCandidate, uint32_t>* sbp) {
+    // Give 'recipientp' an edge to every relative of 'donorp', in both
+    // directions, deleting each donor edge once it has been handled. When
+    // 'sbp' is non-NULL the scoreboard is kept in step with the edges.
+    for (unsigned wi = 0; wi < 2; ++wi) {
+        GraphWay way = wi ? GraphWay::REVERSE : GraphWay::FORWARD;
+        for (V3GraphEdge* edgep = donorp->beginp(way);
+             edgep; edgep = partBlastEdgep(way, edgep)) {
+            MTaskEdge* tedgep = MTaskEdge::cast(edgep);
+            // The donor's edge is going away; take it off the scoreboard
+            if (sbp && !tedgep->removedFromSb())
+                sbp->removeElem(tedgep);
+            LogicMTask* const relativep = tedgep->furtherMTaskp(way);
+            if (!recipientp->hasRelative(way, relativep)) {
+                // The recipient has no edge to this relative yet, make one.
+                MTaskEdge* newEdgep
+                    = ((way == GraphWay::REVERSE)
+                       ? new MTaskEdge(mtasksp, tedgep->fromMTaskp(), recipientp, 1)
+                       : new MTaskEdge(mtasksp, recipientp, tedgep->toMTaskp(), 1));
+                if (sbp) sbp->addElem(newEdgep);
+                continue;
+            }
+            // Existing edge; mark it in need of a rescore. Only the
+            // scoreboard is affected, so without one there is nothing to do.
+            if (!sbp) continue;
+            MTaskEdge* existMTaskEdgep
+                = MTaskEdge::cast(recipientp->findConnectingEdgep(way, relativep));
+            UASSERT(existMTaskEdgep, "findConnectingEdge didn't find edge");
+            if (!existMTaskEdgep->removedFromSb()) {
+                sbp->hintScoreChanged(existMTaskEdgep);
+            }
+        }
+    }
+}
+
+//######################################################################
+// PartContraction
+
+// Perform edge or sibling contraction on the partition graph
+class PartContraction {
+private:
+    // TYPES
+
+    // TODO: might get a little more speed by making this a
+    // vl_unordered_set and defining hash and equal_to functors for the
+    // SiblingMC:
+    typedef std::set<SiblingMC> SibSet;
+    typedef vl_unordered_set<const SiblingMC*> SibpSet;
+    typedef vl_unordered_map<const LogicMTask*, SibpSet> MTask2Sibs;
+
+    // New CP information for mtaskp reflecting an upcoming merge
+    struct NewCp {
+        uint32_t cp;
+        uint32_t propagateCp;
+        bool propagate;
+    };
+
+    // MEMBERS
+    V3Graph* m_mtasksp;  // Mtask graph
+    uint32_t m_scoreLimit;  // Sloppy score allowed when picking merges
+    uint32_t m_scoreLimitBeforeRescore;  // Next score rescore at
+    unsigned m_mergesSinceRescore;  // Merges since last rescore
+    bool m_slowAsserts;  // Take extra time to validate algorithm
+    V3Scoreboard<MergeCandidate, uint32_t> m_sb;  // Scoreboard
+    SibSet m_pairs;  // Storage for each SiblingMC
+    MTask2Sibs m_mtask2sibs;  // SiblingMC set for each mtask
+
+public:
+    // CONSTRUCTORS
+    PartContraction(V3Graph* mtasksp, uint32_t scoreLimit, bool slowAsserts)
+        : m_mtasksp(mtasksp),
+          m_scoreLimit(scoreLimit),
+          m_scoreLimitBeforeRescore(0xffffffff),  // Largest 32-bit value
+          m_mergesSinceRescore(0),
+          m_slowAsserts(slowAsserts),
+          m_sb(&mergeCandidateScore, slowAsserts) {}
+
+    // METHODS
+    void go() {
+        // Merge candidates, best (lowest) score first, until the scoreboard
+        // has nothing eligible left to offer; see the pass description below.
+        unsigned maxMTasks = v3Global.opt.threadsMaxMTasks();
+        if (maxMTasks == 0) {  // Unspecified so estimate
+            if (v3Global.opt.threads() > 1) {
+                maxMTasks = (PART_DEFAULT_MAX_MTASKS_PER_THREAD
+                             * v3Global.opt.threads());
+            } else {
+                // Running PartContraction with --threads <= 1 means self-test
+                maxMTasks = 500;
+            }
+        }
+
+        // OPTIMIZATION PASS: Edge contraction and sibling contraction.
+        //  - Score each pair of mtasks which is a candidate to merge.
+        //    * Each edge defines such a candidate pair
+        //    * Two mtasks that are prereqs or postreqs of a common third
+        //      vertex are "siblings", these are also a candidate pair.
+        //  - Build a list of MergeCandidates, sorted by score.
+        //  - Merge the best pair.
+        //  - Incrementally recompute critical paths near the merged mtask.
+
+        for (V3GraphVertex* itp = m_mtasksp->verticesBeginp(); itp;
+             itp = itp->verticesNextp()) {
+            vl_unordered_set<const V3GraphVertex*> neighbors;
+            for (V3GraphEdge* edgep = itp->outBeginp(); edgep;
+                 edgep=edgep->outNextp()) {
+                m_sb.addElem(MTaskEdge::cast(edgep));
+                if (neighbors.find(edgep->top()) != neighbors.end()) {
+                    itp->v3fatalSrc("Redundant edge found in input to PartContraction()");
+                }
+                neighbors.insert(edgep->top());
+            }
+            siblingPairFromRelatives(GraphWay::REVERSE, itp, true);
+            siblingPairFromRelatives(GraphWay::FORWARD, itp, true);
+        }
+
+        doRescore();  // Set initial scores in scoreboard
+
+        while (1) {
+            // This is the best edge to merge, with the lowest
+            // score (shortest local critical path)
+            MergeCandidate* mergeCanp = const_cast<MergeCandidate*>(m_sb.bestp());
+            if (!mergeCanp) {
+                // Scoreboard found no eligible merges. Maybe a rescore
+                // will produce some merge-able pairs?
+                if (m_sb.needsRescore()) {
+                    doRescore();
+                    continue;
+                }
+                break;
+            }
+
+            if (m_slowAsserts) {
+                UASSERT(!m_sb.needsRescore(mergeCanp),
+                        "Need-rescore items should not be returned by bestp");
+            }
+            uint32_t cachedScore = m_sb.cachedScore(mergeCanp);
+            uint32_t actualScore = mergeCandidateScore(mergeCanp);
+
+            if (actualScore > cachedScore) {
+                // Cached score is out-of-date.
+                // Mark this elem as in need of a rescore and continue.
+                m_sb.hintScoreChanged(mergeCanp);
+                continue;
+            }
+            // ... we'll also confirm that actualScore hasn't shrunk relative
+            // to cached score, after the mergeWouldCreateCycle() check.
+
+            if (actualScore > m_scoreLimit) {
+                // Our best option isn't good enough
+                if (m_sb.needsRescore()) {
+                    // Some pairs need a rescore, maybe those will be
+                    // eligible to merge afterward.
+                    doRescore();
+                    continue;
+                } else {
+                    // We've exhausted everything below m_scoreLimit; stop.
+                    // Except, if we have too many mtasks, raise the score
+                    // limit and keep going...
+                    unsigned mtaskCount = 0;
+                    for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp();
+                         vxp; vxp = vxp->verticesNextp()) ++mtaskCount;
+                    if (mtaskCount > maxMTasks) {
+                        uint32_t oldLimit = m_scoreLimit;
+                        // Raise by 20%, and always by at least 1: for limits
+                        // below 5 the truncated 120% equals the old limit, and
+                        // this loop would then revisit the same candidate forever.
+                        m_scoreLimit = std::max(m_scoreLimit + 1,
+                                                (m_scoreLimit * 120) / 100);
+                        // Line must be >0 otherwise FileLine doesn't check
+                        // if the warning is suppressed with -Wno-UNOPTTHREADS
+                        FileLine dummyFl("AstRoot", 1);
+                        dummyFl.v3warn(UNOPTTHREADS, "Thread scheduler is unable to provide requested parallelism; consider asking for fewer threads.");
+                        UINFO(1,"Critical path limit was="<<oldLimit
+                              <<" now="<<m_scoreLimit<<endl);
+                        continue;
+                    }
+                    // Really stop
+                    break;
+                }
+            }
+            if (actualScore > m_scoreLimitBeforeRescore) {
+                // Time to rescore, that will result in a higher
+                // scoreLimitBeforeRescore, and possibly lower-scoring
+                // elements returned from bestp().
+                doRescore();
+                continue;
+            }
+
+            // Avoid merging any edge that would create a cycle.
+            //
+            // For example suppose we begin with vertices A, B, C and edges
+            // A->B, B->C, A->C.
+            //
+            // Suppose we want to merge A->C into a single vertex.
+            // New edges would be AC->B and B->AC which is not a DAG.
+            // Do not allow this.
+            if (mergeCanp->mergeWouldCreateCycle()) {
+                // Remove this edge from scoreboard so we don't keep
+                // reconsidering it on every loop.
+                m_sb.removeElem(mergeCanp);
+                mergeCanp->removedFromSb(true);
+                continue;
+            }
+
+            partCheckCachedScoreVsActual(cachedScore, actualScore);
+
+            // Finally there's no cycle risk, no need to rescore, we're
+            // within m_scoreLimit and m_scoreLimitBeforeRescore.
+            // This is the edge to merge.
+            // Bookkeeping: if this is the first edge we'll merge since
+            // the last rescore, compute the new m_scoreLimitBeforeRescore
+            // to be somewhat higher than this edge's score.
+            if (m_mergesSinceRescore == 0) {
+#if PART_STEPPED_RESCORELIMIT
+                m_scoreLimitBeforeRescore = (actualScore * 105) / 100;
+#else
+                m_scoreLimitBeforeRescore = actualScore;
+#endif
+
+                // This print can serve as a progress indicator during slow
+                // partitions, as it increases from low numbers up toward
+                // cpLimit. Maybe display something by default even?
+                UINFO(6, "New scoreLimitBeforeRescore: "
+                      <<m_scoreLimitBeforeRescore<<endl);
+            }
+
+            // Finally merge this candidate.
+            contract(mergeCanp);
+        }
+    }
+
+private:
+    NewCp newCp(GraphWay way, LogicMTask* mtaskp, LogicMTask* otherp,
+                MTaskEdge* mergeEdgep) {
+        // Compute the wayward CP that mtaskp will have once merged with
+        // otherp. 'propagate' is set in the result when mtaskp's wayward
+        // relatives will see a larger CP because of the merge.
+        uint32_t mergedCp;
+        if (!mergeEdgep) {
+            // No merge edge (e.g. a sibling merge), so nothing to discount
+            mergedCp = std::max(otherp->critPathCost(way), mtaskp->critPathCost(way));
+        } else if (mtaskp == mergeEdgep->furtherp(way)) {
+            mergedCp = std::max(otherp->critPathCost(way),
+                                mtaskp->critPathCostWithout(way, mergeEdgep));
+        } else {
+            mergedCp = std::max(mtaskp->critPathCost(way),
+                                otherp->critPathCostWithout(way, mergeEdgep));
+        }
+
+        // CP seen by mtaskp's wayward relatives, before and after the merge
+        const uint32_t relativesCpBefore
+            = mtaskp->critPathCost(way) + mtaskp->stepCost();
+        const uint32_t relativesCpAfter
+            = mergedCp + LogicMTask::stepCost(mtaskp->cost() + otherp->cost());
+
+        NewCp result;
+        result.cp = mergedCp;
+        result.propagateCp = relativesCpAfter;
+        result.propagate = (relativesCpAfter > relativesCpBefore);
+        return result;
+    }
+
+    void removeSiblingMCsWith(LogicMTask* mtaskp) {
+        // Drop every SiblingMC that pairs mtaskp with another mtask
+        for (SibpSet::iterator sibIt = m_mtask2sibs[mtaskp].begin();
+             sibIt != m_mtask2sibs[mtaskp].end(); ++sibIt) {
+            const SiblingMC* pairp = *sibIt;
+            if (!pairp->removedFromSb()) m_sb.removeElem(pairp);
+            // Forget the pair on its other member's side, then in m_pairs
+            LogicMTask* otherp = pairp->bp();
+            if (otherp == mtaskp) otherp = pairp->ap();
+            const size_t fromOther = m_mtask2sibs[otherp].erase(pairp);
+            if (fromOther == 0) otherp->v3fatalSrc("Expected existing mtask");
+            const size_t fromPairs = m_pairs.erase(*pairp);
+            if (fromPairs == 0) mtaskp->v3fatalSrc("Expected existing mtask");
+        }
+        const size_t fromMap = m_mtask2sibs.erase(mtaskp);
+        if (fromMap == 0) mtaskp->v3fatalSrc("Expected existing mtask");
+    }
+
+    void contract(MergeCandidate* mergeCanp) {
+        LogicMTask *top = NULL;
+        LogicMTask *fromp = NULL;
+        MTaskEdge* mergeEdgep = dynamic_cast<MTaskEdge*>(mergeCanp);
+        SiblingMC* mergeSibsp = NULL;
+        if (mergeEdgep) {
+            top = dynamic_cast<LogicMTask*>(mergeEdgep->top());
+            fromp = dynamic_cast<LogicMTask*>(mergeEdgep->fromp());
+        } else {
+            mergeSibsp = dynamic_cast<SiblingMC*>(mergeCanp);
+            UASSERT(mergeSibsp,
+                    "Failed to cast mergeCanp to either MTaskEdge or SiblingMC");
+            top = mergeSibsp->ap();
+            fromp = mergeSibsp->bp();
+        }
+
+        // Merge the smaller mtask into the larger mtask.  If one of them
+        // is much larger, this will save time in partMergeEdgesFrom().
+        // Assume the more costly mtask has more edges.
+        //
+        // [TODO: now that we have edge maps, we could count the edges
+        //  exactly without a linear search.]
+        LogicMTask* recipientp;
+        LogicMTask* donorp;
+        if (fromp->cost() > top->cost()) {
+            recipientp = fromp;
+            donorp = top;
+        } else {
+            donorp = fromp;
+            recipientp = top;
+        }
+        fromp = top = NULL;  // Use donorp and recipientp now instead
+
+        // Recursively update forward and reverse CP numbers.
+        //
+        // Doing this before merging the mtasks lets us often avoid
+        // recursing through either incoming or outgoing edges on one or
+        // both mtasks.
+        //
+        // These 'NewCp' objects carry a bit indicating whether we must
+        // propagate CP for each of the four cases:
+        NewCp recipientNewCpFwd
+            = newCp(GraphWay::FORWARD, recipientp, donorp, mergeEdgep);
+        NewCp donorNewCpFwd
+            = newCp(GraphWay::FORWARD, donorp, recipientp, mergeEdgep);
+        NewCp recipientNewCpRev
+            = newCp(GraphWay::REVERSE, recipientp, donorp, mergeEdgep);
+        NewCp donorNewCpRev
+            = newCp(GraphWay::REVERSE, donorp, recipientp, mergeEdgep);
+
+        if (mergeEdgep) {
+            // Remove and free the connecting edge. Must do this before
+            // propagating CP's below.
+            m_sb.removeElem(mergeCanp);
+            mergeEdgep->unlinkDelete(); mergeEdgep=NULL;
+        }
+
+        // This also updates cost and stepCost on recipientp
+        recipientp->moveAllVerticesFrom(donorp);
+
+        UINFO(9, "recipient = "<<recipientp->id()
+              << ", donor = "<<donorp->id()
+              << ", mergeEdgep = "<<mergeEdgep
+              << "\n"
+              << "recipientNewCpFwd = "<<recipientNewCpFwd.cp
+              << (recipientNewCpFwd.propagate ? " true " : " false ")
+              << recipientNewCpFwd.propagateCp
+              << "\n"
+              << "donorNewCpFwd = "<<donorNewCpFwd.cp
+              << (donorNewCpFwd.propagate ? " true " : " false ")
+              << donorNewCpFwd.propagateCp
+              << endl);
+
+        LogicMTask::CpCostAccessor cpAccess;
+        PartPropagateCp<LogicMTask::CpCostAccessor>
+            forwardPropagator(m_mtasksp, GraphWay::FORWARD, &cpAccess, m_slowAsserts);
+        PartPropagateCp<LogicMTask::CpCostAccessor>
+            reversePropagator(m_mtasksp, GraphWay::REVERSE, &cpAccess, m_slowAsserts);
+
+        recipientp->setCritPathCost(GraphWay::FORWARD,
+                                    recipientNewCpFwd.cp);
+        if (recipientNewCpFwd.propagate) {
+            forwardPropagator.cpHasIncreased(recipientp, recipientNewCpFwd.propagateCp);
+        }
+        recipientp->setCritPathCost(GraphWay::REVERSE,
+                                    recipientNewCpRev.cp);
+        if (recipientNewCpRev.propagate) {
+            reversePropagator.cpHasIncreased(recipientp, recipientNewCpRev.propagateCp);
+        }
+        if (donorNewCpFwd.propagate) {
+            forwardPropagator.cpHasIncreased(donorp, donorNewCpFwd.propagateCp);
+        }
+        if (donorNewCpRev.propagate) {
+            reversePropagator.cpHasIncreased(donorp, donorNewCpRev.propagateCp);
+        }
+        forwardPropagator.go();
+        reversePropagator.go();
+
+        // Remove all SiblingMCs that include donorp. This Includes the one
+        // we're merging, if we're merging a SiblingMC.
+        removeSiblingMCsWith(donorp);
+        // Remove all SiblingMCs that include recipientp also, so we can't
+        // get huge numbers of SiblingMCs.  We'll recreate them below, up
+        // to a bounded number.
+        removeSiblingMCsWith(recipientp);
+
+        // Merge all edges
+        partMergeEdgesFrom(m_mtasksp, recipientp, donorp, &m_sb);
+
+        // Delete the donorp mtask from the graph
+        donorp->unlinkDelete(m_mtasksp); donorp = NULL;
+
+        m_mergesSinceRescore++;
+
+        // Do an expensive check, confirm we haven't botched the CP
+        // updates.
+        if (m_slowAsserts) partCheckCriticalPaths(m_mtasksp);
+
+        // Finally, make new sibling pairs as needed:
+        //  - prereqs and postreqs of recipientp
+        //  - prereqs of recipientp's postreqs
+        //  - postreqs of recipientp's prereqs
+        // Note that this depends on the updated critical paths (above).
+        siblingPairFromRelatives(GraphWay::REVERSE, recipientp, true);
+        siblingPairFromRelatives(GraphWay::FORWARD, recipientp, true);
+        unsigned edges = 0;
+        for (V3GraphEdge* edgep = recipientp->outBeginp();
+             edgep; edgep = edgep->outNextp()) {
+            LogicMTask* postreqp = dynamic_cast<LogicMTask*>(edgep->top());
+            siblingPairFromRelatives(GraphWay::REVERSE, postreqp, false);
+            edges++;
+            if (edges > PART_SIBLING_EDGE_LIMIT) break;
+        }
+        edges = 0;
+        for (V3GraphEdge* edgep = recipientp->inBeginp();
+             edgep; edgep = edgep->inNextp()) {
+            LogicMTask* prereqp = dynamic_cast<LogicMTask*>(edgep->fromp());
+            siblingPairFromRelatives(GraphWay::FORWARD, prereqp, false);
+            edges++;
+            if (edges > PART_SIBLING_EDGE_LIMIT) break;
+        }
+    }
+
+    void doRescore() {
+        // Have the scoreboard recompute its scores.  The graph isn't
+        // changing while this runs; per the original author's note, that
+        // is what lets the critPathCost*Without() routines cache some data
+        // in each LogicMTask.  The caching is only an optimization, things
+        // should behave identically without it (just slower.)
+        m_sb.rescore();
+
+        // Report the merge count before it is cleared below.
+        UINFO(6, "Did rescore. Merges since previous = "
+              << m_mergesSinceRescore << endl);
+
+        // Reset the per-round bookkeeping; the score limit goes back to
+        // the all-ones value.
+        m_scoreLimitBeforeRescore = 0xffffffff;
+        m_mergesSinceRescore = 0;
+    }
+
+    // Return the score of a merge candidate, dispatching on its concrete
+    // type (MTaskEdge or SiblingMC).  Any other type is a fatal internal
+    // error.
+    static uint32_t mergeCandidateScore(const MergeCandidate* pairp) {
+        if (const MTaskEdge* edgep = dynamic_cast<const MTaskEdge*>(pairp)) {
+            // Edge scores are bumped by one, so that a SiblingMC wins
+            // against an otherwise-equal-scoring MTaskEdge.  The comment
+            // on selfTestX() explains why that tie-break matters.
+            return 1 + edgeScore(edgep);
+        }
+        if (const SiblingMC* sibsp = dynamic_cast<const SiblingMC*>(pairp)) {
+            return siblingScore(sibsp);
+        }
+        v3fatalSrc("Failed to cast pairp to either MTaskEdge or SiblingMC in mergeCandidateScore");
+        return 0;
+    }
+
+    // Score a sibling pair: the larger of the two forward CP costs, plus
+    // the larger of the two reverse CP costs, plus the step cost of the
+    // two mtasks' summed cost.
+    static uint32_t siblingScore(const SiblingMC* sibsp) {
+        LogicMTask* const firstp = sibsp->ap();
+        LogicMTask* const secondp = sibsp->bp();
+        const uint32_t fwdCp = std::max(firstp->critPathCost(GraphWay::FORWARD),
+                                        secondp->critPathCost(GraphWay::FORWARD));
+        const uint32_t revCp = std::max(firstp->critPathCost(GraphWay::REVERSE),
+                                        secondp->critPathCost(GraphWay::REVERSE));
+        const uint32_t mergedStep
+            = LogicMTask::stepCost(firstp->cost() + secondp->cost());
+        return revCp + fwdCp + mergedStep;
+    }
+
+    static uint32_t edgeScore(const V3GraphEdge* edgep) {
+        // Score this edge. Lower is better. The score is the new local CP
+        // length if we merge the two mtasks this edge connects.  ("Local"
+        // means the longest critical path running through the merged
+        // node.)
+        LogicMTask* const srcp = dynamic_cast<LogicMTask*>(edgep->fromp());
+        LogicMTask* const dstp = dynamic_cast<LogicMTask*>(edgep->top());
+        // The edge being scored would vanish in the merge, so the
+        // '...Without' variants are asked to leave it out.
+        const uint32_t fwdCp
+            = std::max(srcp->critPathCost(GraphWay::FORWARD),
+                       dstp->critPathCostWithout(GraphWay::FORWARD, edgep));
+        const uint32_t revCp
+            = std::max(srcp->critPathCostWithout(GraphWay::REVERSE, edgep),
+                       dstp->critPathCost(GraphWay::REVERSE));
+        const uint32_t mergedStep
+            = LogicMTask::stepCost(srcp->cost() + dstp->cost());
+        return revCp + fwdCp + mergedStep;
+    }
+
+    // Register (ap, bp) as a sibling merge candidate.
+    //
+    // If m_pairs did not already hold this pair, the newly stored element
+    // is indexed under both mtasks in m_mtask2sibs and added to the
+    // scoreboard.  If the pair was already present nothing is added; in
+    // slowAsserts mode we then verify the bookkeeping for it is intact.
+    void makeSiblingMC(LogicMTask* ap, LogicMTask *bp) {
+        SiblingMC newSibs(ap, bp);
+        std::pair<SibSet::iterator, bool> insertResult = m_pairs.insert(newSibs);
+        if (insertResult.second) {
+            // Point at the element stored inside m_pairs, not at the
+            // local 'newSibs' which dies when we return.
+            const SiblingMC* newSibsp = &(*insertResult.first);
+            m_mtask2sibs[ap].insert(newSibsp);
+            m_mtask2sibs[bp].insert(newSibsp);
+            m_sb.addElem(newSibsp);
+        } else if (m_slowAsserts) {
+            // It's fine if we already have this SiblingMC, we may have
+            // created it earlier. Just confirm that we have associated data.
+            if (m_mtask2sibs.find(ap) == m_mtask2sibs.end()) {
+                ap->v3fatalSrc("Sibling not found");
+            }
+            if (m_mtask2sibs.find(bp) == m_mtask2sibs.end()) {
+                bp->v3fatalSrc("Sibling not found");
+            }
+            bool found = false;
+            for (SibpSet::iterator it = m_mtask2sibs[ap].begin();
+                 it != m_mtask2sibs[ap].end(); ++it) {
+                const SiblingMC* sibsp = *it;
+                // Every sibling indexed under 'ap' is checked, so don't
+                // stop early once the colliding pair has been found.
+                if (!sibsp->removedFromSb() && !m_sb.contains(sibsp)) {
+                    ap->v3fatalSrc("One sibling must be the one we collided with");
+                }
+                if (   (sibsp->ap() == ap && sibsp->bp() == bp)
+                    || (sibsp->bp() == ap && sibsp->ap() == bp)) {
+                    found = true;
+                }
+            }
+            if (!found) ap->v3fatalSrc("Sibling not found");
+        }
+    }
+
+    // Direction consulted by shortestWaywardCpInclusive().  qsort() gives
+    // its comparator no context argument, so the caller stores the
+    // direction here before sorting.
+    static const GraphWay* s_shortestWaywardCpInclusiveWay;
+    // qsort() comparator over an array of LogicMTask*.  Orders by
+    // critPathCost(way) + stepCost() ascending, then by id() ascending.
+    static int shortestWaywardCpInclusive(const void* vap, const void* vbp) {
+        const GraphWay& way = *s_shortestWaywardCpInclusiveWay;
+        // Each qsort element is itself a pointer to an mtask
+        const LogicMTask* const lhsp
+            = *reinterpret_cast<const LogicMTask* const *>(vap);
+        const LogicMTask* const rhsp
+            = *reinterpret_cast<const LogicMTask* const *>(vbp);
+        const uint32_t lhsCp = lhsp->critPathCost(way) + lhsp->stepCost();
+        const uint32_t rhsCp = rhsp->critPathCost(way) + rhsp->stepCost();
+        if (lhsCp != rhsCp) return (lhsCp < rhsCp) ? -1 : 1;
+        // Equal cost: fall back on the id
+        if (lhsp->id() < rhsp->id()) return -1;
+        if (lhsp->id() > rhsp->id()) return 1;
+        return 0;
+    }
+
+    // Make SiblingMC candidates out of the relatives of 'mtaskp' found in
+    // direction 'way'.
+    //
+    // The relatives are sorted by critPathCost(way) + stepCost() (ties
+    // broken by id) and then paired off adjacently: first with second,
+    // third with fourth, and so on.  A trailing unpaired relative is
+    // dropped.  Unless 'exhaustive' is set, at most 3 pairs are made.
+    void siblingPairFromRelatives(GraphWay way, V3GraphVertex* mtaskp,
+                                  bool exhaustive) {
+        std::vector<LogicMTask*> shortestPrereqs;
+
+        for (V3GraphEdge* edgep = mtaskp->beginp(way);
+             edgep; edgep = edgep->nextp(way)) {
+            LogicMTask* prereqp = dynamic_cast<LogicMTask*>(edgep->furtherp(way));
+            shortestPrereqs.push_back(prereqp);
+            // Prevent nodes with huge numbers of edges from massively
+            // slowing down the partitioner:
+            if (shortestPrereqs.size() > PART_SIBLING_EDGE_LIMIT) break;
+        }
+
+        if (shortestPrereqs.empty()) return;
+
+        // qsort_r would be nice here, but it isn't portable.  Instead the
+        // comparator reads the direction through a static pointer, which
+        // is only valid for the duration of the qsort() call.
+        s_shortestWaywardCpInclusiveWay = &way;
+        qsort(&shortestPrereqs[0], shortestPrereqs.size(),
+              sizeof(LogicMTask*), &shortestWaywardCpInclusive);
+        // 'way' is a by-value parameter; don't leave the static pointing
+        // at it once we return.
+        s_shortestWaywardCpInclusiveWay = NULL;
+
+        // Don't make all NxN/2 possible pairs of prereqs, that's a lot
+        // to cart around. Just make a few pairs.
+        std::vector<LogicMTask*>::iterator it = shortestPrereqs.begin();
+        for (unsigned i = 0; exhaustive || (i < 3); ++i) {
+            if (it == shortestPrereqs.end()) break;
+            LogicMTask* ap = *(it++);
+            if (it == shortestPrereqs.end()) break;
+            LogicMTask* bp = *(it++);
+            makeSiblingMC(ap, bp);
+        }
+    }
+
+    // SELF TESTS
+
+    // This is a performance test, its intent is to demonstrate that the
+    // partitioner doesn't run on this chain in N^2 time or worse. Overall
+    // runtime should be N*log(N) for a chain-shaped graph.
+    //
+    static void selfTestChain() {
+        vluint64_t usecsSmall = partitionChainUsecs(5);
+        vluint64_t usecsLarge = partitionChainUsecs(500);
+        // If the small run finishes within the timer's resolution it
+        // reports 0 usecs; the bound below would then be 0 and the
+        // unsigned comparison could never pass.  Treat such a run as
+        // having taken one usec when computing the bound.
+        vluint64_t usecsSmallForBound = (usecsSmall == 0) ? 1 : usecsSmall;
+        // Large input is 100x bigger than small input.
+        // With N*log(N) scaling its runtime should be a few hundred times
+        // longer -- not about 10000x longer or worse, which would suggest
+        // N^2 scaling or worse.
+        UASSERT(usecsLarge < (usecsSmallForBound * 1500),
+                "selfTestChain() took longer than expected. Small input runtime = "
+                <<usecsSmall<<", large input runtime = "<<usecsLarge);
+    }
+
+    static vluint64_t partitionChainUsecs(unsigned chain_len) {
+        // Build a chain of 'chain_len' unit-cost mtasks, contract it, and
+        // return the elapsed time as reported by V3Os::timeUsecs().
+        //
+        // NOTE: To get a dot file run with --debugi-V3Partition 4 or more.
+        const vluint64_t startUsecs = V3Os::timeUsecs();
+        V3Graph mtasks;
+        LogicMTask* prevp = NULL;
+        for (unsigned remaining = chain_len; remaining != 0; --remaining) {
+            LogicMTask* const curp = new LogicMTask(&mtasks, NULL);
+            curp->setCost(1);
+            // Link each mtask after its predecessor; the first has none
+            if (prevp) new MTaskEdge(&mtasks, prevp, curp, 1);
+            prevp = curp;
+        }
+        partInitCriticalPaths(&mtasks);
+
+        // The point of this test is to show better-than-N^2 runtime, and
+        // slowAsserts mode is *expected* to cost N^2, so leave it off.
+        // Any CP limit >chain_len should work, use twice the length.
+        PartContraction ec(&mtasks, chain_len * 2, false /* slowAsserts */);
+        ec.go();
+
+        PartParallelismEst check(&mtasks);
+        check.traverse();
+
+        const vluint64_t elapsedUsecs = V3Os::timeUsecs() - startUsecs;
+
+        if (debug()>=6) {
+            UINFO(0, "Chain self test stats:\n");
+            check.debugReport();
+            UINFO(0, "Elapsed usecs = " << elapsedUsecs << "\n");
+        }
+
+        // The whole chain should have collapsed into a single vertex
+        UASSERT_SELFTEST(size_t, check.vertexCount(), 1);
+        return elapsedUsecs;
+    }
+
+    // This test defends against a particular failure mode that the
+    // partitioner exhibited during development:
+    //
+    // At one time, the partitioner consistently favored edge-merges over
+    // equal-scoring sibling merges. Every edge and sibling merge in this
+    // test starts out with an equal score. If you only do edge-merges, all
+    // possible merges will continue to have equal score as the center node
+    // grows and grows. Soon the critical path budget is exhausted by a
+    // large center node, and we still have many small leaf nodes -- it's
+    // literally the worst partition possible.
+    //
+    // Now, instead, the partitioner gives slight favoritism to sibling
+    // merges in the event that scores are tied. This is better for the
+    // test and also real designs.
+    static void selfTestX() {
+        // Graph shape: 50 input leaves -> one center node -> 50 output
+        // leaves, every node with cost 1.
+        //
+        // NOTE: To get a dot file run with --debugi-V3Partition 4 or more.
+        V3Graph mtasks;
+        LogicMTask* const centerp = new LogicMTask(&mtasks, NULL);
+        centerp->setCost(1);
+        // Create the leaves in the same order as ever: all inputs first,
+        // then all outputs.
+        for (unsigned leaf = 0; leaf < 100; ++leaf) {
+            LogicMTask* const leafp = new LogicMTask(&mtasks, NULL);
+            leafp->setCost(1);
+            if (leaf < 50) {
+                // Edge from every input -> center
+                new MTaskEdge(&mtasks, leafp, centerp, 1);
+            } else {
+                // Edge from center -> every output
+                new MTaskEdge(&mtasks, centerp, leafp, 1);
+            }
+        }
+
+        partInitCriticalPaths(&mtasks);
+        PartContraction(&mtasks, 20, true).go();
+
+        PartParallelismEst check(&mtasks);
+        check.traverse();
+
+        if (debug()>=5) {
+            UINFO(0, "X self test stats:\n");
+            check.debugReport();
+        }
+        // Pinning exact values is maybe overly precise; the real goal is
+        // a healthy reduction in the number of mtasks.
+        UASSERT_SELFTEST(uint32_t, check.longestCritPathCost(), 19);
+        UASSERT_SELFTEST(uint32_t, check.totalGraphCost(), 101);
+        UASSERT_SELFTEST(uint32_t, check.vertexCount(), 14);
+        UASSERT_SELFTEST(uint32_t, check.edgeCount(), 13);
+    }
+public:
+    // Run the partitioner self tests: the X-shaped graph test, then the
+    // chain runtime-scaling test.
+    static void selfTest() {
+        selfTestX();
+        selfTestChain();
+    }
+
+private:
+    VL_DEBUG_FUNC;  // Declare debug()
+    VL_UNCOPYABLE(PartContraction);
+};
+
+// Definition of the static direction pointer read by the qsort()
+// comparator shortestWaywardCpInclusive(); starts out NULL.
+const GraphWay* PartContraction::s_shortestWaywardCpInclusiveWay = NULL;
+
+//######################################################################
+// DpiImportCallVisitor
+
+// Scan node, indicate whether it contains a call to a DPI imported
+// routine.
+class DpiImportCallVisitor : public AstNVisitor {
+private:
+    // STATE
+    bool m_hasDpiHazard;  // Saw a DPI import call that isn't allowed to run concurrently
+    bool m_tracingCall;  // Set by a CCall just before iterating its CFunc
+    // METHODS
+    VL_DEBUG_FUNC;
+
+    // VISITORS
+    virtual void visit(AstCFunc* nodep) {
+        // A CFunc is only examined when we arrive through a CCall
+        if (!m_tracingCall) return;
+        m_tracingCall = false;
+        if (nodep->dpiImportWrapper()) {
+            // Pure and non-pure imports are governed by separate options
+            const bool allowed = nodep->pure()
+                ? v3Global.opt.threadsDpiPure()
+                : v3Global.opt.threadsDpiUnpure();
+            if (!allowed) m_hasDpiHazard = true;
+        }
+        iterateChildren(nodep);
+    }
+    virtual void visit(AstCCall* nodep) {
+        iterateChildren(nodep);
+        // Then step into the called function and trace it too
+        m_tracingCall = true;
+        iterate(nodep->funcp());
+    }
+    virtual void visit(AstNode* nodep) {
+        iterateChildren(nodep);
+    }
+
+public:
+    // CONSTRUCTORS
+    // Scans 'nodep' immediately; read the verdict with hasDpiHazard().
+    explicit DpiImportCallVisitor(AstNode* nodep)
+        : m_hasDpiHazard(false)
+        , m_tracingCall(false) {
+        iterate(nodep);
+    }
+    virtual ~DpiImportCallVisitor() {}
+    // ACCESSORS
+    bool hasDpiHazard() const { return m_hasDpiHazard; }
+
+private:
+    VL_UNCOPYABLE(DpiImportCallVisitor);
+};
+
+//######################################################################
+// PartFixDataHazards
+
+// Fix data hazards in the partition graph.
+//
+// The fine-grained graph from V3Order may contain data hazards which are
+// not a problem for serial mode, but which would be a problem in parallel
+// mode.
+//
+// There are basically two classes: unordered pairs of writes, and
+// unordered write-read pairs. We fix both here, with a combination of
+// MTask-merges and new edges to ensure no such unordered pairs remain.
+//
+// ABOUT UNORDERED WRITE-WRITE PAIRS
+//
+//   The V3Order dependency graph treats these as unordered events:
+//
+//    a)  sig[15:8] = stuff;
+//          ...
+//    b)  sig[7:0]  = other_stuff;
+//
+//   Seems OK right? They are writes to disjoint bits of the same
+//   signal. They can run in either order, in serial mode, and the result
+//   will be the same.
+//
+//   The resulting C code for each of these isn't a pure write, it's
+//   actually an R-M-W sequence:
+//
+//    a)  sig = (sig & 0xff)   | (0xff00 & (stuff << 8));
+//          ...
+//    b)  sig = (sig & 0xff00) | (0xff & other_stuff);
+//
+//   In serial mode, order doesn't matter so long as these run serially.
+//   In parallel mode, we must serialize these RMW's to avoid a race.
+//
+//   We don't actually check here if each write would involve an R-M-W, we
+//   just assume that it would. If this routine ever causes a drastic
+//   increase in critical path, it could be optimized to make a better
+//   prediction (with all the risk that word implies!) about whether a
+//   given write is likely to turn into an R-M-W.
+//
+// ABOUT UNORDERED WRITE-READ PAIRS
+//
+//   If we don't put unordered write-read pairs into some order at verilation
+//   time, we risk a runtime race.
+//
+//   How do such unordered writer/reader pairs happen? Here's a partial list
+//   of scenarios:
+//
+//   Case 1: Circular logic
+//
+//     If the design has circular logic, V3Order has by now generated some
+//     dependency cycles, and also cut some of the edges to make it
+//     acyclic.
+//
+//     For serial mode, that was fine. We can break logic circles at an
+//     arbitrary point. At runtime, we'll repeat the _eval() until no
+//     changes are detected, which papers over the discarded dependency.
+//
+//     For parallel mode, this situation can lead to unordered reads and
+//     writes of the same variable, causing a data race. For example if the
+//     original code is this:
+//
+//       assign b = b | a << 2;
+//       assign out = b;
+//
+//     ... there's originally a dependency edge which records that 'b'
+//     depends on the first assign. V3Order may cut this edge, making the
+//     statements unordered. In serial mode that's fine, they can run in
+//     either order. In parallel mode it's a reader/writer race.
+//
+//   Case 2: Race Condition in Verilog Sources
+//
+//     If the input has races, eg. blocking assignments in always blocks
+//     that share variables, the graph at this point will contain unordered
+//     writes and reads (or unordered write-write pairs) reflecting that.
+//
+//   Case 3: Interesting V3Order Behavior
+//
+//     There's code in V3Order that explicitly avoids making a dependency
+//     edge from a clock-gater signal to the logic node that produces the
+//     clock signal. This leads to unordered reader/writer pairs in
+//     parallel mode.
+//
+class PartFixDataHazards {
+private:
+    // TYPES
+    typedef std::set<LogicMTask*, MTaskIdLessThan> LogicMTaskSet;
+    typedef std::map<uint32_t/*rank*/, LogicMTaskSet> TasksByRank;
+    typedef std::set<const OrderVarStdVertex*, OrderByPtrId&> OvvSet;
+    typedef vl_unordered_map<const OrderLogicVertex*, LogicMTask*> Olv2MTaskMap;
+
+    // MEMBERS
+    V3Graph* m_mtasksp;  // Mtask graph
+    Olv2MTaskMap m_olv2mtask;  // Map OrderLogicVertex to LogicMTask who wraps it
+    unsigned m_mergesDone;  // Number of MTasks merged. For stats only.
+public:
+    // CONSTRUCTORs
+    explicit PartFixDataHazards(V3Graph* mtasksp)
+        : m_mtasksp(mtasksp), m_mergesDone(0) {}
+    // METHODS
+private:
+    // Add to *tasksByRankp, keyed by rank, every mtask that writes or
+    // reads the variable *ovvIt.  Logic in an initial or settle domain
+    // is skipped.
+    void findAdjacentTasks(OvvSet::iterator ovvIt, TasksByRank* tasksByRankp) {
+        // Find all writer tasks for this variable, group by rank.
+        // Writers are the logic vertices at the 'from' end of the
+        // variable's incoming edges.
+        for (V3GraphEdge* edgep = (*ovvIt)->inBeginp();
+             edgep; edgep = edgep->inNextp()) {
+            OrderLogicVertex* logicp = dynamic_cast<OrderLogicVertex*>(edgep->fromp());
+            if (!logicp) continue;
+            if (logicp->domainp()->hasInitial()
+                || logicp->domainp()->hasSettle()) continue;
+            LogicMTask* writerMtaskp = m_olv2mtask.at(logicp);
+            (*tasksByRankp)[writerMtaskp->rank()].insert(writerMtaskp);
+        }
+        // Find all reader tasks for this variable, group by rank.
+        // Readers are at the 'to' end of the variable's outgoing edges.
+        // (The 'from' end of an outgoing edge is the variable itself,
+        // which is never an OrderLogicVertex, so looking there finds no
+        // readers at all.)
+        for (V3GraphEdge* edgep = (*ovvIt)->outBeginp();
+             edgep; edgep = edgep->outNextp()) {
+            OrderLogicVertex* logicp = dynamic_cast<OrderLogicVertex*>(edgep->top());
+            if (!logicp) continue;
+            if (logicp->domainp()->hasInitial()
+                || logicp->domainp()->hasSettle()) continue;
+            LogicMTask* readerMtaskp = m_olv2mtask.at(logicp);
+            (*tasksByRankp)[readerMtaskp->rank()].insert(readerMtaskp);
+        }
+    }
+    // For each rank in *tasksByRankp, merge all of that rank's mtasks
+    // into one, then add an edge from the previous rank's merged mtask to
+    // this one unless they're already related.  The per-rank sets are
+    // emptied as a side effect.
+    void mergeSameRankTasks(TasksByRank* tasksByRankp) {
+        LogicMTask* lastMergedp = NULL;
+        for (TasksByRank::iterator rankIt = tasksByRankp->begin();
+             rankIt != tasksByRankp->end(); ++rankIt) {
+            // Find the largest node at this rank, merge into it.  (If we
+            // happen to find a huge node, this saves time in
+            // partMergeEdgesFrom() versus merging into an arbitrary node.)
+            LogicMTask* mergedp = NULL;
+            for (LogicMTaskSet::iterator it = rankIt->second.begin();
+                 it != rankIt->second.end(); ++it) {
+                LogicMTask* mtaskp = *it;
+                if (mergedp) {
+                    if (mergedp->cost() < mtaskp->cost()) {
+                        mergedp = mtaskp;
+                    }
+                } else {
+                    mergedp = mtaskp;
+                }
+            }
+            // Everything left in the set after this is a donor
+            rankIt->second.erase(mergedp);
+
+            while (!rankIt->second.empty()) {
+                LogicMTaskSet::iterator begin = rankIt->second.begin();
+                LogicMTask* donorp = *begin;
+                if (donorp == mergedp) donorp->v3fatalSrc("Donor can't be merged edge");
+                rankIt->second.erase(begin);
+                // Merge donorp into mergedp.
+                // Fix up the map, so donor's OLVs map to mergedp
+                for (LogicMTask::VxList::const_iterator tmvit =
+                         donorp->vertexListp()->begin();
+                     tmvit != donorp->vertexListp()->end(); ++tmvit) {
+                    MTaskMoveVertex* tmvp = *tmvit;
+                    OrderLogicVertex* logicp = tmvp->logicp();
+                    if (logicp) m_olv2mtask[logicp] = mergedp;
+                }
+                // Move all vertices from donorp to mergedp
+                mergedp->moveAllVerticesFrom(donorp);
+                // Move edges from donorp to mergedp
+                partMergeEdgesFrom(m_mtasksp, mergedp, donorp, NULL);
+                // Remove donorp from the graph
+                donorp->unlinkDelete(m_mtasksp); VL_DANGLING(donorp);
+                m_mergesDone++;
+            }
+
+            if (lastMergedp) {
+                // The map iterates in increasing rank order, so the
+                // previous merged mtask must have a strictly lower rank.
+                if (lastMergedp->rank() >= mergedp->rank()) {
+                    mergedp->v3fatalSrc("Merging must be on lower rank");
+                }
+                if (!lastMergedp->hasRelative(GraphWay::FORWARD, mergedp)) {
+                    new MTaskEdge(m_mtasksp, lastMergedp, mergedp, 1);
+                }
+            }
+            lastMergedp = mergedp;
+        }
+    }
+    // Return true if any logic node wrapped by mtaskp contains a DPI
+    // import call that DpiImportCallVisitor reports as a hazard.
+    bool hasDpiHazard(LogicMTask* mtaskp) {
+        for (LogicMTask::VxList::const_iterator it = mtaskp->vertexListp()->begin();
+             it != mtaskp->vertexListp()->end(); ++it) {
+            if (!(*it)->logicp()) continue;
+            AstNode* nodep = (*it)->logicp()->nodep();
+            // NOTE: We don't handle DPI exports. If testbench code calls a
+            // DPI-exported function at any time during eval() we may have
+            // a data hazard. (Likewise in non-threaded mode if an export
+            // messes with an ordered variable we're broken.)
+
+            // Find all calls to DPI-imported functions, we can put those
+            // into a serial order at least. That should solve the most
+            // likely DPI-related data hazards.
+            if (DpiImportCallVisitor(nodep).hasDpiHazard()) {
+                return true;
+            }
+        }
+        return false;
+    }
+public:
+    // Fix the hazards: rank the graph, then serialize accesses per
+    // variable, across all SystemC variables, and across DPI import calls.
+    void go() {
+        // The start time is only needed for the debug message at the end
+        vluint64_t startUsecs = 0;
+        if (debug() >= 3) startUsecs = V3Os::timeUsecs();
+
+        // Build an OLV->mtask map and a set of OVVs
+        OrderByPtrId ovvOrder;
+        OvvSet ovvSet(ovvOrder);
+        // OVV's which wrap systemC vars will be handled slightly specially
+        OvvSet ovvSetSystemC(ovvOrder);
+
+        for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp();
+             vxp; vxp = vxp->verticesNextp()) {
+            LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(vxp);
+            // Should be only one MTaskMoveVertex in each mtask at this
+            // stage, but whatever, write it as a loop:
+            for (LogicMTask::VxList::const_iterator it
+                     = mtaskp->vertexListp()->begin();
+                 it != mtaskp->vertexListp()->end(); ++it) {
+                MTaskMoveVertex* tmvp = *it;
+                if (OrderLogicVertex* logicp = tmvp->logicp()) {
+                    m_olv2mtask[logicp] = mtaskp;
+                    // Look at downstream vars.
+                    for (V3GraphEdge *edgep = logicp->outBeginp();
+                         edgep; edgep = edgep->outNextp()) {
+                        // Only consider OrderVarStdVertex which reflects
+                        // an actual lvalue assignment; the others do not.
+                        OrderVarStdVertex* ovvp
+                            = dynamic_cast<OrderVarStdVertex*>(edgep->top());
+                        if (!ovvp) continue;
+                        if (ovvp->varScp()->varp()->isSc()) {
+                            ovvSetSystemC.insert(ovvp);
+                        } else {
+                            ovvSet.insert(ovvp);
+                        }
+                    }
+                }
+            }
+        }
+
+        // Rank the graph.
+        // DGS is faster than V3GraphAlg's recursive rank, in the worst
+        // cases where the recursive rank must pass through the same node
+        // many times. (We saw 22s for DGS vs. 500s for recursive rank on
+        // one large design.)
+        {
+            GraphStreamUnordered serialize(m_mtasksp);
+            const V3GraphVertex* vertexp;
+            while ((vertexp = serialize.nextp())) {
+                // Rank is one more than the highest-ranked predecessor,
+                // or zero if there are no predecessors.
+                uint32_t rank = 0;
+                for (V3GraphEdge* edgep = vertexp->inBeginp(); edgep;
+                     edgep = edgep->inNextp()) {
+                    rank = std::max(edgep->fromp()->rank() + 1, rank);
+                }
+                const_cast<V3GraphVertex*>(vertexp)->rank(rank);
+            }
+        }
+
+        // For each OrderVarVertex, look at its writer and reader mtasks.
+        //
+        // If there's a set of writers and readers at the same rank, we
+        // know these are unordered with respect to one another, so merge
+        // those mtasks all together.
+        //
+        // At this point, we have at most one merged mtask per rank (for a
+        // given OVV.) Create edges across these remaining mtasks to ensure
+        // they run in serial order (going along with the existing ranks.)
+        //
+        // NOTE: we don't update the CP's stored in the LogicMTasks to
+        // reflect the changes we make to the graph. That's OK, as we
+        // haven't yet initialized CPs when we call this routine.
+        for (OvvSet::iterator ovvit = ovvSet.begin();
+             ovvit != ovvSet.end(); ++ovvit) {
+            // Build a set of mtasks, per rank, which access this var.
+            // Within a rank, sort by MTaskID to avoid nondeterminism.
+            TasksByRank tasksByRank;
+
+            // Find all reader and writer tasks for this variable, add to
+            // tasksByRank.
+            findAdjacentTasks(ovvit, &tasksByRank);
+
+            // Merge all writer and reader tasks from same rank together.
+            //
+            // NOTE: Strictly speaking, we don't need to merge all the
+            // readers together. That may lead to extra serialization. The
+            // least amount of ordering we could impose here would be to
+            // merge all writers at a given rank together; then make edges
+            // from the merged writer node to each reader node at the same
+            // rank; and then from each reader node to the merged writer at
+            // the next rank.
+            //
+            // Whereas, merging all readers and writers at the same rank
+            // together is "the simplest thing that could possibly work"
+            // and it seems to.  It also creates fairly few edges. We don't
+            // want to create tons of edges here, doing so is not nice to
+            // the main edge contraction pass.
+            mergeSameRankTasks(&tasksByRank);
+        }
+
+        // Handle SystemC vars just a little differently. Instead of
+        // treating each var as an independent entity, and serializing
+        // writes to that one var, we treat ALL systemC vars as a single
+        // entity and serialize writes (and, conservatively, reads) across
+        // all of them.
+        //
+        // Reasoning: writing a systemC var actually turns into a call to a
+        // var.write() method, which under the hood is accessing some data
+        // structure that's shared by many SC vars. It's not thread safe.
+        //
+        // Hopefully we only have a few SC vars -- top level ports, probably.
+        {
+            TasksByRank tasksByRank;
+            for (OvvSet::iterator ovvit = ovvSetSystemC.begin();
+                 ovvit != ovvSetSystemC.end(); ++ovvit) {
+                findAdjacentTasks(ovvit, &tasksByRank);
+            }
+            mergeSameRankTasks(&tasksByRank);
+        }
+
+        // Handle nodes containing DPI calls, we want to serialize those
+        // by default unless user gave --threads-dpi-concurrent.
+        // Same basic strategy as above to serialize access to SC vars.
+        if (!v3Global.opt.threadsDpiPure() || !v3Global.opt.threadsDpiUnpure()) {
+            TasksByRank tasksByRank;
+            for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp();
+                 vxp; vxp = vxp->verticesNextp()) {
+                LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(vxp);
+                if (hasDpiHazard(mtaskp)) {
+                    tasksByRank[vxp->rank()].insert(mtaskp);
+                }
+            }
+            mergeSameRankTasks(&tasksByRank);
+        }
+
+        UINFO(4, "PartFixDataHazards() merged "<<m_mergesDone
+              <<" pairs of nodes in "<<(V3Os::timeUsecs() - startUsecs)
+              <<" usecs.\n");
+    }
+
+private:
+    VL_UNCOPYABLE(PartFixDataHazards);
+    VL_DEBUG_FUNC;
+};
+
+
+//######################################################################
+// PartPackMTasks
+
+// Statically pack tasks into threads.
+//
+// The simplest thing that could possibly work would be to assume that our
+// predictions of task runtimes are precise, and that every thread will
+// make progress at an equal rate. Simulate a single "clock", pack the
+// highest priority ready task into whatever thread becomes ready earliest,
+// repeating until no tasks remain.
+//
+// That doesn't work well, as our predictions of task runtimes have wide
+// error bars (+/- 60% is typical.)
+//
+// So be a little more clever: let each task have a different end time,
+// depending on which thread is looking. Be a little bit pessimistic when
+// thread A checks the end time of an mtask running on thread B. This extra
+// "padding" avoids tight "layovers" at cross-thread dependencies.
+class PartPackMTasks {
+private:
+    // TYPES
+    struct MTaskState {  // Per-mtask packing state; value type of MTaskStateMap
+        uint32_t completionTime;  // Estimated end time, unpadded (own-thread view)
+    };
+    struct MTaskCmp {  // Orders mtasks by ascending id(); used to sort m_ready
+        bool operator() (const ExecMTask* ap, const ExecMTask* bp) const {
+            return ap->id() < bp->id();  // Read-only compare, so both are const
+        }
+    };
+
+    // MEMBERS
+    V3Graph* m_mtasksp;  // Mtask graph
+    uint32_t m_nThreads;  // Number of threads
+    uint32_t m_sandbagNumerator;  // Numerator padding for est runtime
+    uint32_t m_sandbagDenom;  // Denominator padding for est runtime
+
+    typedef vl_unordered_map<const ExecMTask*, MTaskState> MTaskStateMap;
+    MTaskStateMap m_mtaskState;  // State for each mtask.
+
+    MTaskCmp m_mtaskCmp;  // Comparison functor; declared before m_ready, which refers to it
+    typedef std::set<ExecMTask*, MTaskCmp&> ReadyMTasks;
+    ReadyMTasks m_ready;  // MTasks ready to be assigned next; all their
+    //                    // dependencies are already assigned.
+
+    typedef std::vector<ExecMTask*> MTaskVec;
+    MTaskVec m_prevMTask;  // Previous mtask scheduled to each thread.
+    std::vector<uint32_t> m_busyUntil;  // Time each thread is occupied until
+
+public:
+    // CONSTRUCTORS
+    PartPackMTasks(V3Graph* mtasksp,  // Graph whose vertices go() casts to ExecMTask
+                   uint32_t nThreads = v3Global.opt.threads(),
+                   unsigned sandbagNumerator = 30,  // Default padding is 30/100 of cost
+                   unsigned sandbagDenom = 100)  // Divisor in completionTime(); must be nonzero
+        : m_mtasksp(mtasksp)
+        , m_nThreads(nThreads)
+        , m_sandbagNumerator(sandbagNumerator)
+        , m_sandbagDenom(sandbagDenom)
+        , m_ready(m_mtaskCmp) {}  // The set holds a reference to our own functor
+    ~PartPackMTasks() {}
+
+    // METHODS
+    uint32_t completionTime(const ExecMTask* mtaskp, uint32_t thread) {  // End time as seen by 'thread'
+        const MTaskState& state = m_mtaskState[mtaskp];
+        UASSERT(mtaskp->thread() != 0xffffffff, "Mtask should have assigned thread");
+        if (thread == mtaskp->thread()) {
+            // No overhead on native thread: report the unpadded estimate
+            return state.completionTime;
+        }
+
+        // Add some padding to the estimated runtime when looking from
+        // another thread: cost * m_sandbagNumerator / m_sandbagDenom
+        uint32_t sandbaggedEndTime = state.completionTime
+            + (m_sandbagNumerator * mtaskp->cost()) / m_sandbagDenom;
+
+        // If task B is packed after task A on thread 0, don't let thread 1
+        // think that A finishes later than thread 0 thinks that B
+        // finishes, otherwise we get priority inversions and fail the self
+        // test.
+        if (mtaskp->packNextp()) {
+            uint32_t successorEndTime  // Successor's end time, asked from mtaskp's own thread
+                = completionTime(mtaskp->packNextp(), mtaskp->thread());
+            if ((sandbaggedEndTime >= successorEndTime)
+                && (successorEndTime > 1)) {
+                sandbaggedEndTime = successorEndTime - 1;  // Clamp to just before successor ends
+            }
+        }
+
+        UINFO(6, "Sandbagged end time for "<<mtaskp->name()
+              <<" on th "<<thread<<" = "<<sandbaggedEndTime<<endl);
+        return sandbaggedEndTime;
+    }
+
+    void setCompletionTime(ExecMTask* mtaskp, uint32_t time) {
+        // Record mtaskp's estimated (unpadded) end time in the state map.
+        m_mtaskState[mtaskp].completionTime = time;
+    }
+
+    void go() {  // Give every mtask a thread and a place in that thread's chain
+        // Build initial ready list: the mtasks that have no in-edges
+        for (V3GraphVertex* vxp = m_mtasksp->verticesBeginp();
+             vxp; vxp = vxp->verticesNextp()) {
+            ExecMTask* mtaskp = dynamic_cast<ExecMTask*>(vxp);
+            if (vxp->inEmpty()) m_ready.insert(mtaskp);
+        }
+
+        m_prevMTask.clear();
+        m_prevMTask.resize(m_nThreads);
+        m_busyUntil.clear();
+        m_busyUntil.resize(m_nThreads);
+
+        while (!m_ready.empty()) {
+            // For each task in the ready set, compute when it might start
+            // on each thread (in that thread's local time frame.)
+            uint32_t bestTime = 0xffffffff;
+            uint32_t bestTh = 0;
+            ExecMTask* bestMtaskp = NULL;
+            for (uint32_t th = 0; th < m_nThreads; ++th) {
+                for (ReadyMTasks::iterator taskIt = m_ready.begin();
+                     taskIt != m_ready.end(); ++taskIt) {
+                    uint32_t timeBegin = m_busyUntil[th];
+                    if (timeBegin > bestTime) {
+                        UINFO(6, "th "<<th<<" busy until "<<timeBegin
+                              <<", later than bestTime "<<bestTime
+                              <<", skipping thread.\n");
+                        break;
+                    }
+                    ExecMTask* taskp = *taskIt;
+                    for (V3GraphEdge* edgep = taskp->inBeginp();
+                         edgep; edgep = edgep->inNextp()) {
+                        ExecMTask* priorp
+                            = dynamic_cast<ExecMTask*>(edgep->fromp());
+                        uint32_t priorEndTime = completionTime(priorp, th);
+                        if (priorEndTime > timeBegin) {
+                            timeBegin = priorEndTime;
+                        }
+                    }
+                    UINFO(6, "Task "<<taskp->name()
+                          <<" start at "<<timeBegin
+                          <<" on thread "<<th<<endl);
+                    if ((timeBegin < bestTime)  // Ties go to higher priority; NULL-safe
+                        || ((timeBegin == bestTime)
+                            && (!bestMtaskp || taskp->priority() > bestMtaskp->priority()))) {
+                        bestTime = timeBegin;
+                        bestTh = th;
+                        bestMtaskp = taskp;
+                    }
+                }
+            }
+
+            UASSERT(bestMtaskp, "Should have found some task");
+            UINFO(6, "Will schedule "<<bestMtaskp->name()<<" onto thread "<<bestTh<<endl);
+            uint32_t bestEndTime = bestTime + bestMtaskp->cost();
+            setCompletionTime(bestMtaskp, bestEndTime);
+
+            // Update the ready list
+            size_t erased = m_ready.erase(bestMtaskp);
+            if (erased == 0) bestMtaskp->v3fatalSrc("Should have erased something?");
+            for (V3GraphEdge* edgeOutp = bestMtaskp->outBeginp();
+                 edgeOutp; edgeOutp = edgeOutp->outNextp()) {
+                ExecMTask* nextp = dynamic_cast<ExecMTask*>(edgeOutp->top());
+
+                UASSERT(nextp->thread() == 0xffffffff,
+                        "Tasks after one being assigned should not be assigned yet");
+                // They also should not be ready yet, since they only now
+                // may become ready
+                if (m_ready.find(nextp) != m_ready.end()) {
+                    nextp->v3fatalSrc("Tasks after one being assigned should not be ready");
+                }
+                bool isReady = true;
+                for (V3GraphEdge* edgeInp = nextp->inBeginp();
+                     edgeInp; edgeInp = edgeInp->inNextp()) {
+                    ExecMTask* priorp = dynamic_cast<ExecMTask*>(edgeInp->fromp());
+                    if (priorp == bestMtaskp) continue;
+                    if (priorp->thread() == 0xffffffff) {
+                        // This prior is not assigned yet
+                        isReady = false;
+                    }
+                }
+                if (isReady) {
+                    m_ready.insert(nextp);
+                    UINFO(6, "Inserted "<<nextp->name()<<" into ready\n");
+                }
+            }
+
+            // Update the ExecMTask itself
+            if (m_prevMTask[bestTh]) {
+                m_prevMTask[bestTh]->packNextp(bestMtaskp);
+                UINFO(6, "Packing "<<bestMtaskp->name()
+                      <<" after "<<m_prevMTask[bestTh]->name()<<endl);
+            } else {
+                UINFO(6, "Marking "<<bestMtaskp->name()<<" as thread root\n");
+                bestMtaskp->threadRoot(true);
+            }
+            bestMtaskp->thread(bestTh);
+
+            // Update the thread state
+            m_prevMTask[bestTh] = bestMtaskp;
+            m_busyUntil[bestTh] = bestEndTime;
+        }
+    }
+
+    // SELF TEST
+    static void selfTest() {  // Pack a 3-mtask fan-out (t0 -> t1, t0 -> t2) onto 2 threads
+        V3Graph graph;
+        ExecMTask* t0 = new ExecMTask(&graph, NULL, 0);
+        t0->cost(1000);
+        t0->priority(1100);
+        ExecMTask* t1 = new ExecMTask(&graph, NULL, 1);
+        t1->cost(100);
+        t1->priority(100);
+        ExecMTask* t2 = new ExecMTask(&graph, NULL, 2);
+        t2->cost(100);
+        t2->priority(100);
+
+        new V3GraphEdge(&graph, t0, t1, 1);
+        new V3GraphEdge(&graph, t0, t2, 1);
+
+        PartPackMTasks packer(&graph,
+                              2,  // Threads
+                              3,  // Sandbag numerator
+                              10);  // Sandbag denom
+        packer.go();
+
+        UASSERT_SELFTEST(bool, t0->threadRoot(), true);
+        UASSERT_SELFTEST(uint32_t, t0->thread(), 0);
+        UASSERT_SELFTEST(const void*, t0->packNextp(), t1);
+
+        UASSERT_SELFTEST(uint32_t, t1->thread(), 0);
+        UASSERT_SELFTEST(bool, t1->threadRoot(), false);
+        UASSERT_SELFTEST(const void*, t1->packNextp(), NULL);
+
+        UASSERT_SELFTEST(uint32_t, t2->thread(), 1);
+        UASSERT_SELFTEST(bool, t2->threadRoot(), true);
+        UASSERT_SELFTEST(const void*, t2->packNextp(), NULL);
+
+        // On its native thread, we see the actual end time for t0:
+        UASSERT_SELFTEST(uint32_t, packer.completionTime(t0, 0), 1000);
+        // On the other thread, we see a sandbagged end time which does not
+        // exceed the t1 end time (1000 + 300 padding, clamped to 1100 - 1):
+        UASSERT_SELFTEST(uint32_t, packer.completionTime(t0, 1), 1099);
+
+        // Actual end time on native thread:
+        UASSERT_SELFTEST(uint32_t, packer.completionTime(t1, 0), 1100);
+        // Sandbagged end time seen on thread 1.  Note it does not compound
+        // with t0's sandbagged time; compounding caused trouble in
+        // practice.
+        UASSERT_SELFTEST(uint32_t, packer.completionTime(t1, 1), 1130);
+        UASSERT_SELFTEST(uint32_t, packer.completionTime(t2, 0), 1229);  // 1199 + 30 padding
+        UASSERT_SELFTEST(uint32_t, packer.completionTime(t2, 1), 1199);  // Starts at 1099
+    }
+
+private:
+    VL_DEBUG_FUNC;  // Declare debug()
+    VL_UNCOPYABLE(PartPackMTasks);
+};
+
+//######################################################################
+// V3Partition implementation
+
+void V3Partition::debugMTaskGraphStats(const V3Graph* graphp, const string& stage) {
+    if (!debug()) return;  // Everything below, the addStat calls included, is debug-only
+
+    UINFO(4, "\n");
+    UINFO(4, " Stats for "<<stage<<endl);
+    uint32_t mtaskCount = 0;
+    uint32_t totalCost = 0;
+    uint32_t mtaskCostHist[32]; memset(mtaskCostHist, 0, sizeof(mtaskCostHist));
+
+    for (const V3GraphVertex* mtaskp = graphp->verticesBeginp(); mtaskp;
+         mtaskp = mtaskp->verticesNextp()) {
+        ++mtaskCount;
+        uint32_t mtaskCost = dynamic_cast<const AbstractMTask*>(mtaskp)->cost();
+        totalCost += mtaskCost;
+
+        unsigned log2Cost = 0;  // Becomes floor(log2(cost)); costs 0 and 1 both give 0
+        while (mtaskCost >>= 1) ++log2Cost;
+        UASSERT(log2Cost < 32, "log2Cost overflow in debugMTaskGraphStats");
+        ++mtaskCostHist[log2Cost];
+    }
+    UINFO(4, "  Total mtask cost = "<<totalCost<<"\n");
+    UINFO(4, "  Mtask count = "<<mtaskCount<<"\n");
+    UINFO(4, "  Avg cost / mtask = "
+          << ((mtaskCount > 0)
+              ? cvtToStr(totalCost / mtaskCount)
+              : "INF!") << "\n");
+    UINFO(4, "  Histogram of mtask costs:\n");
+    for (unsigned i = 0; i < 32; ++i) {
+        if (mtaskCostHist[i]) {  // Report only the non-empty buckets
+            UINFO(4, "    2^"<<i<<": "<<mtaskCostHist[i]<<endl);
+            V3Stats::addStat("MTask graph, "+stage+", mtask cost 2^"
+                             +(i<10 ? " ":"")
+                             +cvtToStr(i), mtaskCostHist[i]);
+        }
+    }
+
+    if (mtaskCount < 1000) {  // Dot dumps are limited to graphs under 1000 mtasks
+        string filePrefix("ordermv_");
+        filePrefix += stage;
+        if (debug() >= 4) graphp->dumpDotFilePrefixedAlways(filePrefix);
+    }
+
+    // Look only at the cost of each mtask, neglect communication cost.
+    // This will show us how much parallelism we expect, assuming cache-miss
+    // costs are minor and the cost of running logic is the dominant cost.
+    PartParallelismEst vertexParEst(graphp);
+    vertexParEst.traverse();
+    vertexParEst.statsReport(stage);
+    if (debug()>=4) {
+        UINFO(0, "\n");
+        UINFO(0, "  Parallelism estimate for based on mtask costs:\n");
+        vertexParEst.debugReport();
+    }
+}
+
+// Print a hash of the shape of graphp.  If you are battling
+// nondeterminism, this can help to pinpoint where in the pipeline it's
+// creeping in.
+void V3Partition::hashGraphDebug(const V3Graph* graphp, const char* debugName) {
+    // Only hash when the user is chasing nondeterminism; otherwise no-op.
+    if (!v3Global.opt.debugNondeterminism()) return;
+
+    // Number the vertices by their position in the graph's vertex list.
+    typedef vl_unordered_map<const V3GraphVertex*, uint32_t> VertexIndexMap;
+    VertexIndexMap indexOf;
+    unsigned nextIndex = 0;
+    const V3GraphVertex* vertexp;
+    for (vertexp = graphp->verticesBeginp(); vertexp; vertexp = vertexp->verticesNextp()) {
+        indexOf[vertexp] = nextIndex++;
+    }
+    // Fold the destination index of every out-edge into a K&R-style hash.
+    unsigned shapeHash = 0;
+    for (vertexp = graphp->verticesBeginp(); vertexp; vertexp = vertexp->verticesNextp()) {
+        const V3GraphEdge* outEdgep = vertexp->outBeginp();
+        for (; outEdgep; outEdgep = outEdgep->outNextp()) {
+            shapeHash = indexOf[outEdgep->top()] + 31u * shapeHash;
+        }
+    }
+    UINFO(0, "Hash of shape (not contents) of "<<debugName<<" = "<<cvtToStr(shapeHash)<<endl);
+}
+
+void V3Partition::setupMTaskDeps(V3Graph* mtasksp, const Vx2MTaskMap* vx2mtaskp) {
+    // Look at each mtask; vx2mtaskp maps each logic vertex to its owning mtask
+    for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp;
+         itp=itp->verticesNextp()) {
+        LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(itp);
+        const LogicMTask::VxList* vertexListp = mtaskp->vertexListp();
+
+        // For each logic vertex in this mtask, create an mtask-to-mtask
+        // edge based on the logic-to-logic edge.
+        for (LogicMTask::VxList::const_iterator vit = vertexListp->begin();
+             vit != vertexListp->end(); ++vit) {
+            for (V3GraphEdge* outp = (*vit)->outBeginp(); outp;
+                 outp = outp->outNextp()) {
+                UASSERT(outp->weight() > 0, "Mtask not assigned weight");
+                const MTaskMoveVertex* top
+                    = dynamic_cast<MTaskMoveVertex*>(outp->top());
+                UASSERT(top, "MoveVertex not associated to mtask");
+                Vx2MTaskMap::const_iterator it = vx2mtaskp->find(top);
+                UASSERT(it != vx2mtaskp->end(), "MTask map can't find id");
+                LogicMTask* otherMTaskp = it->second;  // Mtask owning the edge's destination
+                UASSERT(otherMTaskp, "NULL other Mtask");
+                if (otherMTaskp == mtaskp) mtaskp->v3fatalSrc("Would create a cycle edge");
+
+                // Don't create redundant edges: skip when mtaskp is already
+                // related to otherMTaskp in the forward direction.
+                if (mtaskp->hasRelative(GraphWay::FORWARD, otherMTaskp)) {
+                    continue;
+                }
+                new MTaskEdge(mtasksp, mtaskp, otherMTaskp, 1);
+            }
+        }
+    }
+}
+
+void V3Partition::go(V3Graph* mtasksp) {
+    // Called by V3Order; fills mtasksp from m_fineDepsGraphp, then coarsens it
+    hashGraphDebug(m_fineDepsGraphp, "v3partition initial fine-grained deps");
+
+    // Create the first MTasks. Initially, each MTask just wraps one
+    // MTaskMoveVertex. Over time, we'll merge MTasks together and
+    // eventually each MTask will wrap a large number of MTaskMoveVertices
+    // (and the logic nodes therein.)
+    vluint64_t totalGraphCost = 0;  // 64-bit: the sum and its scaling below must not wrap
+    {
+        // The V3InstrCount within LogicMTask will set user5 on each AST
+        // node, to assert that we never count any node twice.
+        AstUser5InUse inUser5;
+        Vx2MTaskMap vx2mtask;
+        for (V3GraphVertex* vxp = m_fineDepsGraphp->verticesBeginp();
+             vxp; vxp = vxp->verticesNextp()) {
+            MTaskMoveVertex* mtmvVxp = dynamic_cast<MTaskMoveVertex*>(vxp);
+            if (!mtmvVxp) vxp->v3fatalSrc("Every vertex here should be an MTaskMoveVertex");
+
+            LogicMTask* mtaskp = new LogicMTask(mtasksp, mtmvVxp);
+            vx2mtask[mtmvVxp] = mtaskp;
+
+            totalGraphCost += mtaskp->cost();
+        }
+
+        // Create the mtask->mtask dep edges based on vertex deps
+        setupMTaskDeps(mtasksp, &vx2mtask);
+    }
+
+    V3Partition::debugMTaskGraphStats(mtasksp, "initial");
+
+    // For debug: print out the longest critical path.  This allows us to
+    // verify that the costs look reasonable, that we aren't combining
+    // nodes that should probably be split, etc.
+    if (v3Global.opt.dumpTreeLevel(__FILE__) >= 3) {
+        LogicMTask::dumpCpFilePrefixed(mtasksp, "cp");
+    }
+
+    // Merge nodes that could present data hazards; see comment within.
+    {
+        PartFixDataHazards(mtasksp).go();
+        V3Partition::debugMTaskGraphStats(mtasksp, "hazards");
+        hashGraphDebug(mtasksp, "mtasksp after fixDataHazards()");
+    }
+
+    // Setup the critical path into and out of each node.
+    partInitCriticalPaths(mtasksp);
+    hashGraphDebug(mtasksp, "after partInitCriticalPaths()");
+
+    // Order the graph. We know it's already ranked from fixDataHazards()
+    // so we don't need to rank it again.
+    //
+    // On at least some models, ordering the graph here seems to help
+    // performance. (Why? Is it just triggering noise in a lucky direction?
+    // Is it just as likely to harm results?)
+    //
+    // More diversity of models that can build with --threads will
+    // eventually tell us. For now keep the order() so we don't forget
+    // about it, in case it actually helps.  TODO: get more data and maybe
+    // remove this later if it doesn't really help.
+    mtasksp->orderPreRanked();
+
+    int targetParFactor = v3Global.opt.threads();
+    if (targetParFactor < 2) {
+        v3fatalSrc("We should not reach V3Partition when --threads <= 1");
+    }
+
+    // Set cpLimit to roughly totalGraphCost / nThreads
+    //
+    // Actually set it a bit lower, by a hardcoded fudge factor. This
+    // results in more smaller mtasks, which helps reduce fragmentation
+    // when scheduling them.
+    unsigned fudgeNumerator = 3;
+    unsigned fudgeDenominator = 5;
+    uint32_t cpLimit = static_cast<uint32_t>((totalGraphCost * fudgeNumerator)
+                        / (targetParFactor * fudgeDenominator));
+    UINFO(4, "V3Partition set cpLimit = "<<cpLimit<<endl);
+
+    // Merge MTask nodes together, repeatedly, until the CP budget is
+    // reached.  Coarsens the graph, usually by several orders of
+    // magnitude.
+    //
+    // Some tests disable this, hence the test on threadsCoarsen().
+    // Coarsening is always enabled in production.
+    if (v3Global.opt.threadsCoarsen()) {
+        PartContraction(mtasksp, cpLimit,
+                        // --debugPartition is used by tests
+                        // to enable slow assertions.
+                        v3Global.opt.debugPartition()).go();
+        V3Partition::debugMTaskGraphStats(mtasksp, "contraction");
+    }
+    {
+        mtasksp->removeTransitiveEdges();
+        V3Partition::debugMTaskGraphStats(mtasksp, "transitive1");
+    }
+
+    // Reassign MTask IDs onto smaller numbers, which should be more stable
+    // across small logic changes.  Keep MTask IDs in the same relative
+    // order though, otherwise we break CmpLogicMTask for still-existing
+    // EdgeSet's that haven't destructed yet.
+    {
+        typedef std::set<LogicMTask*, LogicMTask::CmpLogicMTask> SortedMTaskSet;
+        SortedMTaskSet sorted;
+        for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp;
+             itp = itp->verticesNextp()) {
+            LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(itp);
+            sorted.insert(mtaskp);
+        }
+        uint32_t nextId = 1;
+        for (SortedMTaskSet::iterator it = sorted.begin();
+             it != sorted.end(); ++it) {
+            // We shouldn't perturb the sort order of the set, despite
+            // changing the IDs, they should all just remain in the same
+            // relative order. Confirm that:
+            UASSERT(nextId <= (*it)->id(), "Should only shrink MTaskIDs here");
+            UINFO(4, "Reassigning MTask id " << (*it)->id()
+                  << " to id " << nextId << "\n");
+            (*it)->id(nextId);
+            nextId++;
+        }
+    }
+
+    // Set color to indicate an mtaskId on every underlying MTaskMoveVertex.
+    for (V3GraphVertex* itp = mtasksp->verticesBeginp(); itp;
+         itp = itp->verticesNextp()) {
+        LogicMTask* mtaskp = dynamic_cast<LogicMTask*>(itp);
+        for (LogicMTask::VxList::const_iterator it
+                 = mtaskp->vertexListp()->begin();
+             it != mtaskp->vertexListp()->end(); ++it) {
+            MTaskMoveVertex* mvertexp = *it;
+            mvertexp->color(mtaskp->id());
+        }
+    }
+}
+
+void V3Partition::finalizeCosts(V3Graph* execMTaskGraphp) {
+    GraphStreamUnordered ser(execMTaskGraphp, GraphWay::REVERSE);
+
+    while (const V3GraphVertex* vxp = ser.nextp()) {
+        ExecMTask* mtp = dynamic_cast<ExecMTask*>(const_cast<V3GraphVertex*>(vxp));
+        uint32_t costCount = V3InstrCount::count(mtp->bodyp(), false);
+        mtp->cost(costCount);
+        mtp->priority(costCount);  // Starting value; raised below if a follower adds more
+
+        // "Priority" is the critical path from the start of the mtask, to
+        // the end of the graph reachable from this mtask.  Given the
+        // choice among several ready mtasks, we'll want to start the
+        // highest priority one first, so we're always working on the "long
+        // pole"
+        for (V3GraphEdge* edgep = mtp->outBeginp();
+             edgep; edgep = edgep->outNextp()) {
+            ExecMTask* followp = dynamic_cast<ExecMTask*>(edgep->top());
+            if ((followp->priority() + mtp->cost()) > mtp->priority()) {
+                mtp->priority(followp->priority() + mtp->cost());
+            }
+        }
+    }
+
+    // Some MTasks may now have zero cost, eliminate those.
+    // (It's common for tasks to shrink to nothing when V3LifePost
+    // removes dly assignments.)
+    for (V3GraphVertex* vxp = execMTaskGraphp->verticesBeginp(); vxp; ) {
+        ExecMTask* mtp = dynamic_cast<ExecMTask*>(vxp);
+        vxp = vxp->verticesNextp();  // Advance before delete
+
+        // Don't rely on checking mtp->cost() == 0 to detect an empty task.
+        // Our cost-estimating logic is just an estimate. Instead, check
+        // the MTaskBody to see if it's empty. That's the source of truth.
+        AstMTaskBody* bodyp = mtp->bodyp();
+        if (!bodyp->stmtsp()) {  // Kill this empty mtask
+            UINFO(6, "Removing zero-cost "<<mtp->name()<<endl);
+            for (V3GraphEdge* inp = mtp->inBeginp();  // Bypass: link each pred to each succ
+                 inp; inp = inp->inNextp()) {
+                for (V3GraphEdge* outp = mtp->outBeginp();
+                     outp; outp = outp->outNextp()) {
+                    new V3GraphEdge(execMTaskGraphp, inp->fromp(),
+                                    outp->top(), 1);
+                }
+            }
+            mtp->unlinkDelete(execMTaskGraphp); VL_DANGLING(mtp);
+            // Also remove and delete the AstMTaskBody, otherwise it would
+            // keep a dangling pointer to the ExecMTask.
+            bodyp->unlinkFrBack()->deleteTree(); VL_DANGLING(bodyp);
+        }
+    }
+
+    // Removing tasks may cause edges that were formerly non-transitive to
+    // become transitive. Also we just created new edges around the removed
+    // tasks, which could be transitive. Prune out all transitive edges.
+    {
+        execMTaskGraphp->removeTransitiveEdges();
+        V3Partition::debugMTaskGraphStats(execMTaskGraphp,
+                                          "transitive2");
+    }
+
+    // Record summary stats for final m_tasks graph.
+    // (More verbose stats are available with --debugi-V3Partition >= 3.)
+    PartParallelismEst parEst(execMTaskGraphp);
+    parEst.traverse();
+    parEst.statsReport("final");
+    if (debug() >= 3) {
+        UINFO(0,"  Final mtask parallelism report:\n");
+        parEst.debugReport();
+    }
+}
+
+void V3Partition::finalize() {
+    // Called by Verilator top stage
+    AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
+    UASSERT(execGraphp, "Couldn't find AstExecGraph singleton.");
+
+    // Back in V3Order, we partitioned mtasks using provisional cost
+    // estimates. However, V3Order precedes some optimizations (notably
+    // V3LifePost) that can change the cost of logic within each mtask.
+    // Now that logic is final, recompute the cost and priority of each
+    // ExecMTask.
+    finalizeCosts(execGraphp->mutableDepGraphp());
+
+    // "Pack" the mtasks: statically associate each mtask with a thread,
+    // and determine the order in which each thread will run its mtasks.
+    PartPackMTasks(execGraphp->mutableDepGraphp()).go();
+}
+
+void V3Partition::selfTest() {  // Runs the self tests of three partitioner helper classes
+    PartPropagateCpSelfTest::selfTest();
+    PartPackMTasks::selfTest();
+    PartContraction::selfTest();
+}
diff --git a/src/V3Partition.h b/src/V3Partition.h
new file mode 100644
index 000000000..ae5606a8e
--- /dev/null
+++ b/src/V3Partition.h
@@ -0,0 +1,99 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//*************************************************************************
+// DESCRIPTION: Verilator: Threading's logic to mtask partitioner
+//
+// Code available from: http://www.veripool.org/verilator
+//
+//*************************************************************************
+//
+// Copyright 2003-2018 by Wilson Snyder.  This program is free software; you can
+// redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+//
+// Verilator is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+//*************************************************************************
+
+#ifndef _V3PARTITION_H_
+#define _V3PARTITION_H_
+
+#include "config_build.h"
+#include "verilatedos.h"
+#include <list>
+
+#include "V3Graph.h"
+#include "V3OrderGraph.h"
+
+class LogicMTask;
+typedef vl_unordered_map<const MTaskMoveVertex*, LogicMTask*> Vx2MTaskMap;
+
+//*************************************************************************
+/// V3Partition takes the fine-grained logic graph from V3Order and
+/// collapses it into a coarse-grained graph of AbstractLogicMTask's, each
+/// of which contains a set of the logic nodes from the fine-grained
+/// graph.
+
+class V3Partition {
+    // MEMBERS
+    V3Graph* m_fineDepsGraphp;  // Fine-grained dependency graph
+public:
+    // CONSTRUCTORS
+    explicit V3Partition(V3Graph* fineDepsGraphp)
+        : m_fineDepsGraphp(fineDepsGraphp) {}
+    ~V3Partition() {}
+
+    // METHODS
+
+    // Fill in the provided empty graph with AbstractLogicMTask's and their
+    // interdependencies.
+    void go(V3Graph* mtasksp);
+
+    static void selfTest();  // Runs the self tests of the partitioner's helper classes
+
+    // Print out a hash of the shape of graphp.  Only needed to debug the
+    // origin of some nondeterminism; otherwise this is pretty useless.
+    static void hashGraphDebug(const V3Graph* graphp, const char* debugName);
+
+    // Print debug stats about graphp whose nodes must be AbstractMTask's.
+    static void debugMTaskGraphStats(const V3Graph* graphp, const string& stage);
+
+    // Operate on the final ExecMTask graph, immediately prior to code
+    // generation time.
+    static void finalize();
+private:
+    static void finalizeCosts(V3Graph* execMTaskGraphp);  // Recompute cost/priority, drop empties
+    static void setupMTaskDeps(V3Graph* mtasksp, const Vx2MTaskMap* vx2mtaskp);
+
+    VL_DEBUG_FUNC;  // Declare debug()
+    VL_UNCOPYABLE(V3Partition);
+};
+
+//*************************************************************************
+// Map a pointer into a id, for e.g. nodep to mtask mappings
+
+class PartPtrIdMap {
+private:
+    // TYPES
+    typedef vl_unordered_map <const void*, vluint64_t> PtrMap;
+    // MEMBERS (mutable so that the const findId() can record new ids)
+    mutable vluint64_t m_nextId;  // Id to hand to the next unseen pointer
+    mutable PtrMap m_id;  // Ids already handed out, keyed by pointer
+public:
+    // CONSTRUCTORS
+    PartPtrIdMap() : m_nextId(0) {}
+    // METHODS
+    // Return ptrp's id, assigning the next sequential id on first sight.
+    vluint64_t findId(const void* ptrp) const {
+        PtrMap::iterator knownIt = m_id.find(ptrp);
+        if (knownIt != m_id.end()) return knownIt->second;
+        const vluint64_t freshId = m_nextId++;
+        m_id[ptrp] = freshId;
+        return freshId;
+    }
+};
+
+#endif  // Guard
diff --git a/src/V3PartitionGraph.h b/src/V3PartitionGraph.h
new file mode 100644
index 000000000..5ac29082a
--- /dev/null
+++ b/src/V3PartitionGraph.h
@@ -0,0 +1,108 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//*************************************************************************
+// DESCRIPTION: Verilator: Threading's graph structures
+//
+// Code available from: http://www.veripool.org/verilator
+//
+//*************************************************************************
+//
+// Copyright 2003-2018 by Wilson Snyder.  This program is free software; you can
+// redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+//
+// Verilator is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+//*************************************************************************
+
+#ifndef _V3PARTITIONGRAPH_H_
+#define _V3PARTITIONGRAPH_H_
+
+#include "config_build.h"
+#include "verilatedos.h"
+#include <list>
+
+#include "V3Graph.h"
+#include "V3OrderGraph.h"
+
+//*************************************************************************
+// MTasks and graph structures
+
+class AbstractMTask : public V3GraphVertex {  // Common base of logic and exec mtasks
+public:
+    explicit AbstractMTask(V3Graph* graphp) : V3GraphVertex(graphp) {}
+    virtual ~AbstractMTask() {}
+    virtual uint32_t id() const = 0;  // Id of this mtask
+    virtual uint32_t cost() const = 0;  // Cost of this mtask; units are set by the subclass
+};
+
+class AbstractLogicMTask : public AbstractMTask {  // An mtask holding a list of logic vertices
+public:
+    // TYPES
+    typedef std::list<MTaskMoveVertex*> VxList;
+    // CONSTRUCTORS
+    explicit AbstractLogicMTask(V3Graph* graphp) : AbstractMTask(graphp) {}
+    virtual ~AbstractLogicMTask() {}
+    // METHODS
+    // Set of logic vertices in this mtask. Order is not significant.
+    virtual const VxList* vertexListp() const = 0;
+    virtual uint32_t id() const = 0;  // Unique id of this mtask.
+    virtual uint32_t cost() const = 0;  // Same pure virtual as in AbstractMTask
+};
+
+class ExecMTask : public AbstractMTask {
+private:
+    AstMTaskBody*       m_bodyp;     // Task body
+    uint32_t            m_id;        // Unique id of this mtask.
+    uint32_t            m_priority;  // Predicted critical path from the start of
+    // this mtask to the ends of the graph that are reachable from this
+    // mtask. In abstract time units.
+    uint32_t            m_cost;      // Predicted runtime of this mtask, in the same
+    // abstract time units as priority().
+    uint32_t            m_thread;    // Thread for static (pack_mtasks) scheduling,
+    // or 0xffffffff if not yet assigned.
+    const ExecMTask*    m_packNextp;  // Next mtask packed on the same thread, or NULL
+    bool                m_threadRoot;  // True if first mtask packed onto its thread
+    VL_UNCOPYABLE(ExecMTask);
+public:
+    ExecMTask(V3Graph* graphp, AstMTaskBody* bodyp, uint32_t id)
+        : AbstractMTask(graphp),
+          m_bodyp(bodyp),
+          m_id(id),
+          m_priority(0),
+          m_cost(0),
+          m_thread(0xffffffff),
+          m_packNextp(NULL),
+          m_threadRoot(false) {}
+    AstMTaskBody* bodyp() const { return m_bodyp; }
+    virtual uint32_t id() const { return m_id; }
+    uint32_t priority() const { return m_priority; }
+    void priority(uint32_t pri) { m_priority = pri; }
+    virtual uint32_t cost() const { return m_cost; }
+    void cost(uint32_t cost) { m_cost = cost; }
+    void thread(uint32_t thread) { m_thread = thread; }
+    uint32_t thread() const { return m_thread; }
+    void packNextp(const ExecMTask* nextp) { m_packNextp = nextp; }
+    const ExecMTask* packNextp() const { return m_packNextp; }
+    bool threadRoot() const { return m_threadRoot; }
+    void threadRoot(bool threadRoot) { m_threadRoot = threadRoot; }
+    string cFuncName() const {
+        // If this MTask maps to a C function, this should be the name
+        return string("__Vmtask")+"__"+cvtToStr(m_id);
+    }
+    string name() const { return string("mt")+cvtToStr(id()); }
+    void dump(std::ostream& str) const {  // Name and address, then only the fields that are set
+        str <<name()<<"."<<((void*)this);
+        if (priority() || cost()) str <<" [pr="<<priority()<<" c="<<cvtToStr(cost())<<"]";
+        if (thread() != 0xffffffff) str <<" th="<<thread();
+        if (threadRoot()) str <<" [ROOT]";
+        if (packNextp()) str <<" nx="<<packNextp()->name();
+    }
+};
+inline std::ostream& operator<<(std::ostream& os, const ExecMTask& rhs) {
+    rhs.dump(os); return os; }  // Stream insertion forwards to dump()
+
+#endif  // Guard
diff --git a/src/V3Trace.cpp b/src/V3Trace.cpp
index 984c0fb6f..10c30a72c 100644
--- a/src/V3Trace.cpp
+++ b/src/V3Trace.cpp
@@ -182,6 +182,7 @@ private:
     AstNode*		m_chgSubParentp;// Which node has call to m_chgSubFuncp
     int			m_chgSubStmts;	// Statements under function being built
     AstVarScope*	m_activityVscp;	// Activity variable
+    uint32_t            m_activityNumber;  // Count of fields in activity variable
     uint32_t		m_code;		// Trace ident code# being assigned
     V3Graph		m_graph;	// Var/CFunc tracking
     TraceActivityVertex* m_alwaysVtxp;	// "Always trace" vertex
@@ -297,7 +298,7 @@ private:
 
     void assignActivity() {
 	// Select activity numbers and put into each CFunc vertex
-	uint32_t activityNumber = 1;	// Note 0 indicates "slow"
+        m_activityNumber = 1;  // Note 0 indicates "slow"
 	for (V3GraphVertex* itp = m_graph.verticesBeginp(); itp; itp=itp->verticesNextp()) {
 	    if (TraceActivityVertex* vvertexp = dynamic_cast<TraceActivityVertex*>(itp)) {
 		if (!vvertexp->activityCodeValid()) {
@@ -306,17 +307,39 @@ private:
 			// This makes us need less activityNumbers and so speeds up the fast path.
 			vvertexp->activityCode(TraceActivityVertex::ACTIVITY_SLOW);
 		    } else {
-			vvertexp->activityCode(activityNumber++);
+                        vvertexp->activityCode(m_activityNumber++);
 		    }
 		}
 	    }
 	}
 
-	// Insert global variable
-	if (!activityNumber) activityNumber++;   // For simplicity, always create it
-	int activityBits = VL_WORDS_I(activityNumber)*VL_WORDSIZE;   // For tighter code; round to next 32 bit point.
-	AstVar* newvarp = new AstVar (m_chgFuncp->fileline(), AstVarType::MODULETEMP,
-				      "__Vm_traceActivity", VFlagBitPacked(), activityBits);
+        AstVar* newvarp;
+        if (v3Global.opt.mtasks()) {
+            // Create a vector of bytes, not bits, for the tracing vector,
+            // so that we can set them atomically without locking.
+            //
+            // TODO: It would be slightly faster to have a bit vector per
+            // chain of packed MTasks, but we haven't packed the MTasks yet.
+            // If we support fully threaded tracing in the future, it would
+            // make sense to improve this at that time.
+            AstNodeDType* newScalarDtp
+                = new AstBasicDType(m_chgFuncp->fileline(), VFlagLogicPacked(), 1);
+            v3Global.rootp()->typeTablep()->addTypesp(newScalarDtp);
+            AstNodeDType* newArrDtp = new AstUnpackArrayDType(
+                m_chgFuncp->fileline(),
+                newScalarDtp,
+                new AstRange(m_chgFuncp->fileline(),
+                             VNumRange(m_activityNumber-1, 0, false)));
+            v3Global.rootp()->typeTablep()->addTypesp(newArrDtp);
+            newvarp = new AstVar(m_chgFuncp->fileline(),
+                                 AstVarType::MODULETEMP,
+                                  "__Vm_traceActivity", newArrDtp);
+        } else {
+            // For tighter code; round to next 32 bit point.
+            int activityBits = VL_WORDS_I(m_activityNumber)*VL_WORDSIZE;
+            newvarp = new AstVar(m_chgFuncp->fileline(), AstVarType::MODULETEMP,
+                                 "__Vm_traceActivity", VFlagBitPacked(), activityBits);
+        }
 	m_topModp->addStmtp(newvarp);
 	AstVarScope* newvscp = new AstVarScope(newvarp->fileline(), m_highScopep, newvarp);
 	m_highScopep->addVarp(newvscp);
@@ -329,15 +352,23 @@ private:
 		    FileLine* fl = vvertexp->insertp()->fileline();
 		    uint32_t acode = vvertexp->activityCode();
 		    vvertexp->insertp()->addNextHere
-			(new AstAssign (fl,
-					new AstSel (fl, new AstVarRef(fl, m_activityVscp, true),
-						    acode, 1),
-					new AstConst (fl, AstConst::LogicTrue())));
+                        (new AstAssign(fl, selectActivity(fl, acode, true),
+                                       new AstConst(fl, AstConst::LogicTrue())));
 		}
 	    }
 	}
     }
 
+    AstNode* selectActivity(FileLine* flp, uint32_t acode, bool lvalue) {
+        if (v3Global.opt.mtasks()) {
+            return new AstArraySel(
+                flp, new AstVarRef(flp, m_activityVscp, lvalue), acode);
+        } else {
+            return new AstSel(
+                flp, new AstVarRef(flp, m_activityVscp, lvalue), acode, 1);
+        }
+    }
+
     AstCFunc* newCFunc(AstCFuncType type, const string& name, AstCFunc* basep) {
 	AstCFunc* funcp = new AstCFunc(basep->fileline(), name, basep->scopep());
 	funcp->slow(basep->slow());
@@ -453,8 +484,7 @@ private:
 		    AstNode* condp = NULL;
 		    for (ActCodeSet::const_iterator csit = actset.begin(); csit!=actset.end(); ++csit) {
 			uint32_t acode = *csit;
-			AstNode* selp = new AstSel (fl, new AstVarRef(fl, m_activityVscp, false),
-						    acode, 1);
+                        AstNode* selp = selectActivity(fl, acode, false);
 			if (condp) condp = new AstOr (fl, condp, selp);
 			else condp = selp;
 		    }
@@ -473,11 +503,19 @@ private:
 
 	// Clear activity after tracing completes
 	FileLine* fl = m_chgFuncp->fileline();
-	AstNode* clrp = new AstAssign (fl,
-				       new AstVarRef(fl, m_activityVscp, true),
-				       new AstConst(fl, V3Number(fl, m_activityVscp->width())));
-	m_fullFuncp->addFinalsp(clrp->cloneTree(true));
-	m_chgFuncp->addFinalsp(clrp);
+        if (v3Global.opt.mtasks()) {
+            for (uint32_t i = 0; i < m_activityNumber; ++i) {
+                AstNode* clrp = new AstAssign(fl, selectActivity(fl, i, true),
+                                              new AstConst(fl, AstConst::LogicFalse()));
+                m_fullFuncp->addFinalsp(clrp->cloneTree(true));
+                m_chgFuncp->addFinalsp(clrp);
+            }
+        } else {
+            AstNode* clrp = new AstAssign(fl, new AstVarRef(fl, m_activityVscp, true),
+                                          new AstConst(fl, V3Number(fl, m_activityVscp->width())));
+            m_fullFuncp->addFinalsp(clrp->cloneTree(true));
+            m_chgFuncp->addFinalsp(clrp);
+        }
     }
 
     uint32_t assignDeclCode(AstTraceDecl* nodep) {
@@ -699,6 +737,7 @@ public:
 	m_chgSubFuncp = NULL;
 	m_chgSubParentp = NULL;
 	m_chgSubStmts = 0;
+        m_activityNumber = 0;
         m_code = 0;
         m_finding = false;
 	m_funcNum = 0;
diff --git a/src/Verilator.cpp b/src/Verilator.cpp
index 515af82b0..a39a45139 100644
--- a/src/Verilator.cpp
+++ b/src/Verilator.cpp
@@ -73,6 +73,7 @@
 #include "V3Param.h"
 #include "V3Parse.h"
 #include "V3ParseSym.h"
+#include "V3Partition.h"
 #include "V3PreShell.h"
 #include "V3Premit.h"
 #include "V3Reloop.h"
@@ -524,6 +525,14 @@ void process () {
 	V3EmitC::emitcSyms();
 	V3EmitC::emitcTrace();
     }
+    if (!v3Global.opt.xmlOnly()
+        && v3Global.opt.mtasks()) {
+        // Finalize our MTask cost estimates and pack the mtasks into
+        // threads. Must happen pre-EmitC which relies on the packing
+        // order. Must happen post-V3LifePost which changes the relative
+        // costs of mtasks.
+        V3Partition::finalize();
+    }
     if (!v3Global.opt.xmlOnly()) { // Unfortunately we have some lint checks in emitc.
 	V3EmitC::emitc();
     }
@@ -607,8 +616,11 @@ int main(int argc, char** argv, char** env) {
     VHashSha1::selfTest();
     AstBasicDTypeKwd::selfTest();
     V3Graph::selfTest();
-    V3TSP::selfTest();
-    V3ScoreboardBase::selfTest();
+    if (v3Global.opt.debugSelfTest()) {
+        V3TSP::selfTest();
+        V3ScoreboardBase::selfTest();
+        V3Partition::selfTest();
+    }
 
     // Read first filename
     v3Global.readFiles();
diff --git a/test_regress/Makefile b/test_regress/Makefile
index 84ed472bb..8b559c0ce 100644
--- a/test_regress/Makefile
+++ b/test_regress/Makefile
@@ -44,7 +44,7 @@ endif
 
 .PHONY: test
 test:
-	$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --dist
+	$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --vltmt --dist
 
 ######################################################################
 
@@ -61,6 +61,9 @@ nc:
 vlt:
 	$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --stop
 
+vltmt:
+	$(PERL) driver.pl $(DRIVER_FLAGS) --vltmt --stop
+
 ######################################################################
 
 random:
diff --git a/test_regress/driver.pl b/test_regress/driver.pl
index ce7bc815b..a98fe5a9f 100755
--- a/test_regress/driver.pl
+++ b/test_regress/driver.pl
@@ -45,6 +45,7 @@ our %All_Scenarios
        nc    => ["simulator", "nc"],
        vcs   => ["simulator", "vcs"],
        vlt   => ["simulator", "vlt_all", "vlt"],
+       vltmt => ["simulator", "vlt_all", "vltmt"],
     );
 
 #======================================================================
@@ -104,6 +105,7 @@ if (! GetOptions (
           "ms!"         => sub { $opt_scenarios{ms} = $_[1]; },
           "nc!"         => sub { $opt_scenarios{nc} = $_[1]; },
           "vlt!"        => sub { $opt_scenarios{vlt} = $_[1]; },
+          "vltmt!"      => sub { $opt_scenarios{vltmt} = $_[1]; },
           "vcs!"        => sub { $opt_scenarios{vcs} = $_[1]; },
           "<>"          => \&parameter,
     )) {
@@ -322,6 +324,7 @@ sub new {
     $self->{scenario} ||= "ghdl" if $self->{ghdl};
     $self->{scenario} ||= "vcs" if $self->{vcs};
     $self->{scenario} ||= "vlt" if $self->{vlt};
+    $self->{scenario} ||= "vltmt" if $self->{vltmt};
     $self->{scenario} ||= "nc" if $self->{nc};
     $self->{scenario} ||= "ms" if $self->{ms};
     $self->{scenario} ||= "iv" if $self->{iv};
@@ -407,6 +410,7 @@ sub new {
 	ms_run_flags => [split(/\s+/,"-lib $self->{obj_dir}/work -c -do 'run -all;quit' ")],
 	# Verilator
 	vlt => 0,
+        vltmt => 0,
 	verilator_flags => ["-cc",
 			    "-Mdir $self->{obj_dir}",
 			    "-OD",  # As currently disabled unless -O3
@@ -420,7 +424,7 @@ sub new {
 	%$self};
     bless $self, $class;
 
-    $self->{vlt_all} = $self->{vlt};  # Any Verilator scenario
+    $self->{vlt_all} = $self->{vlt} || $self->{vltmt};  # Any Verilator scenario
 
     $self->{VM_PREFIX} ||= "V".$self->{name};
     $self->{stats} ||= "$self->{obj_dir}/V".$self->{name}."__stats.txt";
@@ -593,6 +597,8 @@ sub compile_vlt_flags {
     unshift @verilator_flags, "--gdbbt" if $opt_gdbbt;
     unshift @verilator_flags, "--x-assign unique";  # More likely to be buggy
     unshift @verilator_flags, "--trace" if $opt_trace;
+    unshift @verilator_flags, "--threads 3" if $param{vltmt};
+    unshift @verilator_flags, "--debug-partition" if $param{vltmt};
     if (defined $opt_optimize) {
 	my $letters = "";
 	if ($opt_optimize =~ /[a-zA-Z]/) {
@@ -746,6 +752,11 @@ sub compile {
 	    return 1;
 	}
 
+        if ($self->{vltmt} && !$self->cfg_with_threaded) {
+            $self->skip("Test requires Verilator configured with threads\n");
+            return 1;
+        }
+
 	if (!$param{fails} && $param{verilator_make_gcc}
 	    && $param{make_main}) {
 	    $self->_make_main();
@@ -2045,7 +2056,11 @@ Run Synopsys VCS simulator tests.
 
 =item --vlt
 
-Run Verilator tests.  Default unless another scenario flag is provided.
+Run Verilator tests in single-threaded mode.  Default unless another scenario flag is provided.
+
+=item --vltmt
+
+Run Verilator tests in multithreaded mode.
 
 =back
 
diff --git a/test_regress/t/t_a_selftest.pl b/test_regress/t/t_a_selftest.pl
new file mode 100755
index 000000000..a4290143b
--- /dev/null
+++ b/test_regress/t/t_a_selftest.pl
@@ -0,0 +1,22 @@
+#!/usr/bin/perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003 by Wilson Snyder. This program is free software; you can
+# redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+
+scenarios(vlt_all => 1);
+
+top_filename("t/t_EXAMPLE.v");
+
+compile(
+    verilator_flags2 => ['--debug-self-test'],
+    verilator_make_gcc => 0,
+    make_top_shell => 0,
+    make_main => 0,
+    );
+
+ok(1);
+1;
diff --git a/test_regress/t/t_case_huge.pl b/test_regress/t/t_case_huge.pl
index 664c4c566..20fdf61e5 100755
--- a/test_regress/t/t_case_huge.pl
+++ b/test_regress/t/t_case_huge.pl
@@ -15,7 +15,8 @@ compile(
 
 if ($Self->{vlt_all}) {
     file_grep ($Self->{stats}, qr/Optimizations, Tables created\s+(\d+)/i, 10);
-    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 8);
+    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
+               ($Self->{vltmt} ? 0 : 8));
 }
 
 execute(
diff --git a/test_regress/t/t_dpi_threads.pl b/test_regress/t/t_dpi_threads.pl
new file mode 100755
index 000000000..51bd34df3
--- /dev/null
+++ b/test_regress/t/t_dpi_threads.pl
@@ -0,0 +1,21 @@
+#!/usr/bin/perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2018 by Wilson Snyder. This program is free software; you can
+# redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+
+scenarios(vltmt => 1);
+
+compile(
+    v_flags2 => ["t/t_dpi_threads_c.cpp --no-threads-coarsen"],
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+ok(1);
+1;
diff --git a/test_regress/t/t_dpi_threads.v b/test_regress/t/t_dpi_threads.v
new file mode 100644
index 000000000..5f982b09b
--- /dev/null
+++ b/test_regress/t/t_dpi_threads.v
@@ -0,0 +1,62 @@
+// DESCRIPTION: Verilator: Verilog Test module
+//
+// Copyright 2018 by Wilson Snyder. This program is free software; you can
+// redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+
+import "DPI-C" dpii_sys_task = function void \$dpii_sys ();
+import "DPI-C" dpii_failure = function int \$dpii_failure ();
+
+module t (clk);
+   input clk;
+   integer cyc;
+   integer failure;
+
+   initial cyc = 0;
+
+`ifndef verilator
+   `error "Only Verilator supports PLI-ish DPI calls."
+`endif
+
+   always @ (posedge clk) begin
+      if (cyc == 2) begin
+         failure = $dpii_failure();
+         $write("* failure = %0d\n", failure);
+         if (failure > 0) begin
+            $stop;
+         end
+         $write("*-* All Finished *-*\n");
+         $finish;
+      end
+      cyc <= cyc + 1;
+   end
+
+   // The purpose of this test is to confirm that the DPI-call serialization
+   // code in V3Partition does ensure that these DPI calls do not run
+   // concurrently.
+   //
+   // Alternatively, the test may be run with "--threads-dpi all" in which case
+   // it should confirm that the calls do run concurrently and do detect a
+   // collision (they should, if the test is set up right.)  This is
+   // t_dpi_threads_collide.pl.
+   //
+   // Q) Is it a risk that the partitioner will merge or serialize these always
+   //    blocks, just by luck, even if the DPI-call serialization code fails?
+   //
+   // A) Yes, that's why t_dpi_threads_collide.pl also passes
+   //    --no-threads-coarsen to disable MTask coarsening.  This ensures that
+   //    the MTask graph at the end of FixDataHazards (where we resolve DPI
+   //    hazards) is basically the final MTasks graph, and that data hazards
+   //    which persist beyond FixDataHazards should persist in the final
+   //    generated C code.
+
+   always @ (posedge clk) begin
+      $dpii_sys();
+   end
+
+   always @ (posedge clk) begin
+      $dpii_sys();
+   end
+
+endmodule
diff --git a/test_regress/t/t_dpi_threads_c.cpp b/test_regress/t/t_dpi_threads_c.cpp
new file mode 100644
index 000000000..814a8ce28
--- /dev/null
+++ b/test_regress/t/t_dpi_threads_c.cpp
@@ -0,0 +1,78 @@
+// -*- mode: C++; c-file-style: "cc-mode" -*-
+//*************************************************************************
+//
+// Copyright 2018-2018 by Wilson Snyder. This program is free software; you can
+// redistribute it and/or modify it under the terms of either the GNU
+// Lesser General Public License Version 3 or the Perl Artistic License
+// Version 2.0.
+//
+// Verilator is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+//*************************************************************************
+
+#include <atomic>
+#include <cstdio>
+#include <iostream>
+#include <unistd.h>
+#include "svdpi.h"
+
+//======================================================================
+
+#if defined(VERILATOR)
+# ifdef T_DPI_THREADS_COLLIDE
+#  include "Vt_dpi_threads_collide__Dpi.h"
+# else
+#  include "Vt_dpi_threads__Dpi.h"
+# endif
+#elif defined(VCS)
+# include "../vc_hdrs.h"
+#elif defined(CADENCE)
+# define NEED_EXTERNS
+#else
+# error "Unknown simulator for DPI test"
+#endif
+
+#ifdef NEED_EXTERNS
+extern "C" {
+    extern void dpii_sys_task();
+    extern int dpii_failure();
+}
+#endif
+
+//======================================================================
+
+struct state {  // Shared bookkeeping used to detect overlapping dpii_sys_task() calls
+    std::atomic<bool> task_is_running;  // True while a dpii_sys_task() call is in flight
+    std::atomic<int> failure;  // Set to 1 once a collision is seen; returned by dpii_failure()
+    state() : task_is_running(false)
+            , failure(0) {}  // failure is an int, so start it at 0 rather than the bool 'false'
+};
+
+static state st;
+
+void dpii_sys_task() {
+    bool other_task_running = atomic_exchange(&st.task_is_running, true);
+    if (other_task_running) {
+        // Another task is running. This is a collision.
+        st.failure = 1;
+        std::cerr << "t_dpi_threads_c.cpp dpii_sys_task() saw threads collide.\n";
+    } else {
+        std::cerr << "t_dpi_threads_c.cpp dpii_sys_task() no collision. @" << &st.task_is_running << "\n";
+    }
+
+    // Spend some time in the DPI call, so that if we can have a collision
+    // we probably will. Technically this is not guaranteed to detect every
+    // race. However, one second is so much greater than the expected
+    // runtime of everything else in the test, it really should pick up on
+    // races just about all of the time.
+    sleep(1);
+
+    atomic_exchange(&st.task_is_running, false);
+}
+
+int dpii_failure() {
+    return st.failure;
+}
diff --git a/test_regress/t/t_dpi_threads_collide.pl b/test_regress/t/t_dpi_threads_collide.pl
new file mode 100755
index 000000000..9b9c8731b
--- /dev/null
+++ b/test_regress/t/t_dpi_threads_collide.pl
@@ -0,0 +1,28 @@
+#!/usr/bin/perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2018 by Wilson Snyder. This program is free software; you can
+# redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+
+scenarios(vltmt => 1);
+
+top_filename("t/t_dpi_threads.v");
+
+compile(
+    v_flags2 => ["t/t_dpi_threads_c.cpp --threads-dpi all --no-threads-coarsen"],
+    );
+
+# Similar to t_dpi_threads, which confirms that Verilator can prevent a
+# race between DPI import calls, this test confirms that the race exists
+# and that the DPI C code can detect it under --threads-dpi all
+# mode.
+#
+execute(
+    fails => 1,
+    );
+
+ok(1);
+1;
diff --git a/test_regress/t/t_emit_memb_limit.pl b/test_regress/t/t_emit_memb_limit.pl
index 619a0cbc9..75034312c 100755
--- a/test_regress/t/t_emit_memb_limit.pl
+++ b/test_regress/t/t_emit_memb_limit.pl
@@ -43,7 +43,10 @@ gen($Self->{top_filename}, 6000);
 compile(
     verilator_flags2=>["-x-assign fast --x-initial fast",
                        "-Wno-UNOPTTHREADS",
-    ],
+                       # The slow V3Partition asserts are just too slow
+                       # in this test. They're disabled just for performance
+                       # reasons:
+                       "--no-debug-partition"],
     );
 
 execute(
diff --git a/test_regress/t/t_gantt.pl b/test_regress/t/t_gantt.pl
new file mode 100755
index 000000000..e5fe71c54
--- /dev/null
+++ b/test_regress/t/t_gantt.pl
@@ -0,0 +1,74 @@
+#!/usr/bin/perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003 by Wilson Snyder. This program is free software; you can
+# redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+
+use IO::File;
+
+# Test for bin/verilator_gantt,
+#
+# Only needed in multithreaded regression.
+scenarios(vltmt => 1);
+
+# It doesn't really matter what test
+# we use, so long as it runs several cycles,
+# enough for the profiling to happen:
+top_filename("t/t_gen_alw.v");
+
+compile(
+    v_flags2 => ["--prof-threads"]
+    );
+
+execute(
+    all_run_flags => ["+verilator+prof+threads+start+2",
+                      " +verilator+prof+threads+window+2",
+                      " +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
+                      ],
+    check_finished => 1,
+    );
+
+# verilator_gantt takes the profile data file as its first argument; the
+#  run above wrote that file via +verilator+prof+threads+file+.
+# The text gantt chart goes to verilator_gantt's STDOUT, which is redirected
+#  into gantt.log here so that the checks below can parse it.
+run(cmd => ["$ENV{VERILATOR_ROOT}/bin/verilator_gantt",
+            "$Self->{obj_dir}/profile_threads.dat",
+            "--vcd $Self->{obj_dir}/profile_threads.vcd",
+            "> $Self->{obj_dir}/gantt.log"]);
+
+# We should have three lines of gantt chart, each with
+# an even number of mtask-bars (eg "[123--]")
+my $gantt_line_ct = 0;
+my $global_mtask_ct = 0;
+{
+    my $fh = IO::File->new("<$Self->{obj_dir}/gantt.log")
+        or error("$! $Self->{obj_dir}/gantt.log");
+    while (my $line = ($fh && $fh->getline)) {
+        if ($line !~ m/^  t:/) { next; }
+        $gantt_line_ct++;
+        my $this_thread_mtask_ct = 0;
+        my @mtasks = split(/\[/, $line);
+        shift @mtasks; # throw away the text before the first '['
+        foreach my $mtask (@mtasks) {
+            # Format of each mtask is "[123--]" where the hyphens,
+            # number, or ] may or may not appear; it depends on exact timing.
+            $this_thread_mtask_ct++;
+            $global_mtask_ct++;
+        }
+        if ($this_thread_mtask_ct % 2 != 0) { error("odd number of mtasks found"); }
+    }
+}
+if ($gantt_line_ct != 3) { error("wrong number of gantt lines"); }
+if ($global_mtask_ct == 0) { error("wrong number of mtasks, should be > 0"); }
+print "Found $gantt_line_ct lines of gantt data with $global_mtask_ct mtasks\n"
+    if $Self->{verbose};
+
+# Diff to itself, just to check parsing
+vcd_identical("$Self->{obj_dir}/profile_threads.vcd", "$Self->{obj_dir}/profile_threads.vcd");
+
+ok(1);
+1;
diff --git a/test_regress/t/t_gate_tree.pl b/test_regress/t/t_gate_tree.pl
index 78ab1582b..7ceb6e365 100755
--- a/test_regress/t/t_gate_tree.pl
+++ b/test_regress/t/t_gate_tree.pl
@@ -117,6 +117,10 @@ compile(
     );
 
 execute(
+    all_run_flags => ["+verilator+prof+threads+start+100",
+                      " +verilator+prof+threads+window+2",
+                      " +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
+                      ],
     check_finished => 1,
     );
 
diff --git a/test_regress/t/t_help.pl b/test_regress/t/t_help.pl
index 057320634..c53b67f7b 100755
--- a/test_regress/t/t_help.pl
+++ b/test_regress/t/t_help.pl
@@ -13,6 +13,7 @@ foreach my $prog (
     "../bin/verilator",
     "../bin/verilator_coverage",
     "../bin/verilator_difftree",
+    "../bin/verilator_gantt",
     "../bin/verilator_profcfunc",
     ) {
     run(fails => 1,
diff --git a/test_regress/t/t_inst_tree_inl0_pub1.pl b/test_regress/t/t_inst_tree_inl0_pub1.pl
index 88b3b2d5a..f72a90a3d 100755
--- a/test_regress/t/t_inst_tree_inl0_pub1.pl
+++ b/test_regress/t/t_inst_tree_inl0_pub1.pl
@@ -38,7 +38,8 @@ sub checkRelativeRefs {
 if ($Self->{vlt_all}) {
     # We expect to combine sequent functions across multiple instances of
     # l2, l3, l4, l5. If this number drops, please confirm this has not broken.
-    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 52);
+    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
+               ($Self->{vltmt} ? 84 : 52));
 
     # Expect absolute refs in CFuncs for t (top module) and l1 (because it
     # has only one instance)
diff --git a/test_regress/t/t_inst_tree_inl0_pub1_norelcfuncs.pl b/test_regress/t/t_inst_tree_inl0_pub1_norelcfuncs.pl
index c35acde3d..448b1412f 100755
--- a/test_regress/t/t_inst_tree_inl0_pub1_norelcfuncs.pl
+++ b/test_regress/t/t_inst_tree_inl0_pub1_norelcfuncs.pl
@@ -18,7 +18,8 @@ compile(
 if ($Self->{vlt_all}) {
     # Fewer optimizations than t_inst_tree_inl0_pub1 which allows
     # relative CFuncs:
-    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 31);
+    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
+               ($Self->{vltmt} ? 0 : 31));
 
     # Should not find any 'this->' except some 'this->__VlSymsp'
     my @files = `ls $Self->{obj_dir}/*.cpp`;
diff --git a/test_regress/t/t_threads_counter_1.pl b/test_regress/t/t_threads_counter_1.pl
index 2f4f64743..bc7ea206a 100755
--- a/test_regress/t/t_threads_counter_1.pl
+++ b/test_regress/t/t_threads_counter_1.pl
@@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
 
-scenarios(simulator => 1);
-$Self->cfg_with_threaded or skip("No thread support");
+scenarios(vltmt => 1);
 
 top_filename("t/t_threads_counter.v");
 
diff --git a/test_regress/t/t_threads_counter_2.pl b/test_regress/t/t_threads_counter_2.pl
index e016b8253..c5b325ae6 100755
--- a/test_regress/t/t_threads_counter_2.pl
+++ b/test_regress/t/t_threads_counter_2.pl
@@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
 
-scenarios(simulator => 1);
-$Self->cfg_with_threaded or skip("No thread support");
+scenarios(vltmt => 1);
 
 top_filename("t/t_threads_counter.v");
 
diff --git a/test_regress/t/t_threads_counter_4.pl b/test_regress/t/t_threads_counter_4.pl
new file mode 100755
index 000000000..3f8fab395
--- /dev/null
+++ b/test_regress/t/t_threads_counter_4.pl
@@ -0,0 +1,23 @@
+#!/usr/bin/perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003-2009 by Wilson Snyder. This program is free software; you can
+# redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+
+scenarios(vltmt => 1);
+
+top_filename("t/t_threads_counter.v");
+
+compile(
+    verilator_flags2 => ['--cc --threads 4'],
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+ok(1);
+1;
diff --git a/test_regress/t/t_threads_nondeterminism.pl b/test_regress/t/t_threads_nondeterminism.pl
new file mode 100755
index 000000000..c37d9bd1f
--- /dev/null
+++ b/test_regress/t/t_threads_nondeterminism.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003-2009 by Wilson Snyder. This program is free software; you can
+# redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+
+scenarios(vltmt => 1);
+
+top_filename("t/t_threads_counter.v");
+
+compile(
+    verilator_flags2 => ['--cc --threads 2 --debug-nondeterminism'],
+    );
+
+execute(
+    check_finished => 1,
+    );
+
+file_grep("$Self->{obj_dir}/vlt_compile.log", qr/hash of shape/i);
+
+ok(1);
+1;
diff --git a/test_regress/t/t_verilated_all.pl b/test_regress/t/t_verilated_all.pl
index 7c59c1fa3..ebd9e713a 100755
--- a/test_regress/t/t_verilated_all.pl
+++ b/test_regress/t/t_verilated_all.pl
@@ -13,7 +13,12 @@ my $root = "..";
 
 compile(
     # Can't use --coverage and --savable together, so cheat and compile inline
-    verilator_flags2 => ['--cc --coverage-toggle --coverage-line --coverage-user --trace --vpi $root/include/verilated_save.cpp'],
+    verilator_flags2 => ["--cc",
+                         "--coverage-toggle --coverage-line --coverage-user",
+                         "--trace --vpi ",
+                         ($Self->cfg_with_threaded
+                          ? "--threads 2 $root/include/verilated_threads.cpp" : ""),
+                         "$root/include/verilated_save.cpp"],
     );
 
 execute(
@@ -43,7 +48,8 @@ foreach my $dfile (glob("$Self->{obj_dir}/*.d")) {
 
 foreach my $file (sort keys %hit) {
     if (!$hit{$file}
-        && $file !~ /_sc/) {
+        && $file !~ /_sc/
+        && ($file !~ /_thread/ || $Self->cfg_with_threaded)) {
         error("Include file not covered by t_verilated_all test: ",$file);
     }
 }
diff --git a/test_regress/t/t_verilated_threaded.pl b/test_regress/t/t_verilated_threaded.pl
index a61548dbc..6d49e77af 100755
--- a/test_regress/t/t_verilated_threaded.pl
+++ b/test_regress/t/t_verilated_threaded.pl
@@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
 
-scenarios(simulator => 1);
-$Self->cfg_with_threaded or skip("No thread support");
+scenarios(vltmt => 1);
 
 top_filename("t/t_verilated_all.v");