MAJOR: Add multithreaded model generation.

2018-07-22 20:54:28 -04:00 · 2018-07-22 20:54:28 -04:00 · ec8dbbffed
parent 0070520edb
commit ec8dbbffed
48 changed files with 5949 additions and 71 deletions
--- a/7
+++ b/7
@ -5,15 +5,18 @@ The contributors that suggested a given feature are shown in []. Thanks!
 * Verilator 4.000 devel
 **    This is a major release.  Any patches may require major rework to apply.
      [Thanks everyone]
 **    Add multithreaded model generation.
 **    Add runtime arguments.
 **    Fix internals to be C++ null-pointer-check clean.
 ***   Better optimize large always block splitting, bug1244. [John Coiner]
 ***   Add new reloop optimization for repetitive assignment compression.
 ****  Fix internals to be C++ null-pointer-check clean.
 ****  Fix internals to avoid 'using namespace std'.
 ****  Fix Verilation performance issues, bug1316. [John Coiner]
--- a/Makefile.in
+++ b/Makefile.in
@ -120,6 +120,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \
 	bin/verilator \
 	bin/verilator_coverage \
 	bin/verilator_difftree \
 	bin/verilator_gantt \
 	bin/verilator_includer \
 	bin/verilator_profcfunc \
 	doxygen-mainpage doxygen.config veripool-logo.png \
@ -154,6 +155,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \
 INST_PROJ_FILES = \
 	bin/verilator \
 	bin/verilator_coverage \
 	bin/verilator_gantt \
 	bin/verilator_includer \
 	bin/verilator_profcfunc \
 	include/verilated.mk \
@ -272,12 +274,12 @@ internals.pdf: internals.pod Makefile
 # See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
 VL_INST_BIN_FILES = verilator verilator_bin verilator_bin_dbg verilator_coverage_bin_dbg \
-	verilator_coverage verilator_includer verilator_profcfunc
+	verilator_coverage verilator_gantt verilator_includer verilator_profcfunc
 # Some scripts go into both the search path and pkgdatadir,
 # so they can be found by the user, and under $VERILATOR_ROOT.
 # See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
-VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_profcfunc.1
+VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_gantt.1 verilator_profcfunc.1
 VL_INST_INC_BLDDIR_FILES = \
 	include/verilated_config.h \
@ -295,6 +297,7 @@ installbin:
 	$(SHELL) ${srcdir}/mkinstalldirs $(DESTDIR)$(bindir)
 	( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator $(DESTDIR)$(bindir)/verilator )
 	( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_coverage $(DESTDIR)$(bindir)/verilator_coverage )
 	( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_gantt $(DESTDIR)$(bindir)/verilator_gantt )
 	( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_profcfunc $(DESTDIR)$(bindir)/verilator_profcfunc )
 	( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin $(DESTDIR)$(bindir)/verilator_bin )
 	( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin_dbg $(DESTDIR)$(bindir)/verilator_bin_dbg )
--- a/bin/verilator
+++ b/bin/verilator
@ -338,6 +338,7 @@ detailed descriptions in L</"VERILATION ARGUMENTS"> for more information.
    --pipe-filter <command>     Filter all input through a script
    --prefix <topname>          Name of top level class
    --prof-cfuncs               Name functions for profiling
    --prof-threads              Enable generating gantt chart data for threads
    --private                   Debugging; see docs
    --public                    Debugging; see docs
     -pvalue+<name>=<value>     Overwrite toplevel parameter
@ -350,6 +351,9 @@ detailed descriptions in L</"VERILATION ARGUMENTS"> for more information.
    --stats-vars                Provide statistics on variables
     -sv                        Enable SystemVerilog parsing
     +systemverilogext+<ext>    Synonym for +1800-2017ext+<ext>
    --threads <threads>         Enable multithreading
    --threads-dpi <mode>        Enable multithreaded DPI
    --threads-max-mtasks <mtasks>  Tune maximum mtask partitioning
    --top-module <topname>      Name of top level input module
    --trace                     Enable waveform creation
    --trace-depth <levels>      Depth of tracing
@ -386,6 +390,9 @@ detailed descriptions in L</"RUNTIME ARGUMENTS"> for more information.
     +verilator+debug                  Enable debugging
     +verilator+debugi+<value>         Enable debugging at a level
     +verilator+help                   Display help
     +verilator+prof+threads+file+I<filename>  Set profile filename
     +verilator+prof+threads+start+I<value>    Set profile starting point
     +verilator+prof+threads+window+I<value>   Set profile duration
     +verilator+rand+reset+<value>     Set random reset technique
     +verilator+V                      Verbose version and config
     +verilator+version                Show version and exit
@ -1080,6 +1087,18 @@ Verilog module and line number the statement came from.  This allows gprof
 or oprofile reports to be correlated with the original Verilog source
 statements. See also L<verilator_profcfunc>.
 =item --prof-threads
 Enable gantt chart data collection for threaded builds.
 Verilator will record the start and end time of each macro-task across a
 number of calls to eval. (What is a macro-task? See the Verilator internals
 document.)
 When profiling is enabled, the runtime will emit a blurb of profiling data
 in non-human-friendly form. The C<verilator_gantt> script will transform
 this into a nicer visual format and produce some related statistics.
 =item --private
 Opposite of --public.  Is the default; this option exists for backwards
@ -1134,7 +1153,10 @@ Enable including save and restore functions in the generated model.
 The user code must create a VerilatedSerialize or VerilatedDeserialize
 object then calling the << or >> operators on the generated model and any
-other data the process needs saved/restored.  For example:
+other data the process needs saved/restored.  These functions are not
 thread safe, and are typically called only by a main thread.
 For example:
    void save_model(const char* filenamep) {
        VerilatedSave os;
@ -1173,6 +1195,42 @@ compatibility with other simulators.
 A synonym for C<+1800-2017ext+>I<ext>.
 =item --threads I<threads>
 =item --no-threads
 With --threads 0 or --no-threads, the default, the generated model is not
 thread safe. With --threads 1, the generated model is single threaded but
 may run in a multithreaded environment. With --threads N, where N >= 2, the
 model is generated to run multithreaded on up to N threads. See
 L</"MULTITHREADING">.
 =item --threads-dpi all
 =item --threads-dpi none
 =item --threads-dpi pure
 When using --dpi with --threads, control what DPI tasks are thread safe.
 With --threads-dpi all, enable Verilator to assume all DPI imports are
 threadsafe, and to use thread-local storage for communication with DPI,
 potentially improving performance. Any DPI libraries need appropriate
 mutexes to avoid undefined behavior.
 With --threads-dpi none, Verilator assumes DPI imports are not thread safe,
 and Verilator will serialize calls to DPI imports by default, potentially
 harming performance.
 With --threads-dpi pure, the default, Verilator assumes DPI pure imports
 are threadsafe, but non-pure DPI imports are not.
 =item --threads-max-mtasks I<value>
 Rarely needed.  When using --threads, specify the number of mtasks the
 model is to be partitioned into. If unspecified, Verilator approximates a
 good value.
 =item --top-module I<topname>
 When the input Verilog contains more than one top level module, specifies
@ -1464,6 +1522,28 @@ Enable debugging at the provided level.
 Display help and exit.
 =item +verilator+prof+threads+file+I<filename>
 When using --prof-threads, the filename to dump to.  Defaults to
 "profile_threads.dat".
 =item +verilator+prof+threads+start+I<value>
 When using --prof-threads, Verilator will wait until $time is at this
 value, then start the profiling warmup, then capturing. Generally this
 should be set to some time that is well within the normal operation of the
 simulation, i.e. outside of reset. If 0, the dump is disabled. Defaults to
 1.
 =item +verilator+prof+threads+window+I<value>
 When using --prof-threads, after $time reaches
 +verilator+prof+threads+start, Verilator will warm up the profiling for
 this number of eval() calls, then will capture the profiling of this number
 of eval() calls.  Defaults to 2, which makes sense for a
 single-clock-domain module where it's typical to want to capture one
 posedge eval() and one negedge eval().
 =item +verilator+rand+reset+I<value>
 When a model was Verilated using "--x-initial unique", sets the
@ -1635,6 +1715,9 @@ compile times, and --x-assign=fast --x-initial=fast may increase the risk
 of reset bugs in trade for performance; see the above documentation for
 these flags.
 If using Verilated multithreaded, use C<numactl> to ensure you are using
 non-conflicting hardware resources. See L</"MULTITHREADING">.
 Minor Verilog code changes can also give big wins.  You should not have any
 UNOPTFLAT warnings from Verilator.  Fixing these warnings can result in
 huge improvements; one user fixed their one UNOPTFLAT warning by making a
@ -2176,6 +2259,89 @@ the names of the .cpp files to compile in from the make variables generated
 in obj_dir/Vour_classes.mk.
 =head1 MULTITHREADING
 Verilator experimentally supports multithreading.
 With --no-threads, the default, the model is not thread safe, and any use
 of more than one thread calling into one or even different Verilated models
 may result in unpredictable behavior. This gives the highest single thread
 performance.
 With --threads 1, the generated model is single threaded, however the
 support libraries are multithread safe. This allows different
 instantiations of model(s) to potentially each be run under a different
 thread.  All threading is the responsibility of the user's C++ testbench.
 With --threads N, where N is at least 2, the generated model will be
 designed to run in parallel on N threads. The thread calling eval()
 provides one of those threads, and the generated model will create and
 manage the other N-1 threads. It's the client's responsibility not to
 oversubscribe the available CPU cores. Under CPU oversubscription, the
 Verilated model should not livelock nor deadlock, however, you can expect
 performance to be far worse than it would be with proper stoichiometry of
 threads and CPU cores.
 The remainder of this section describes behavior with --threads 1 or
 --threads N (not --no-threads).
 VL_THREADED is defined when compiling a threaded Verilated module, causing
 the Verilated support classes to become threadsafe.
 The thread used for constructing a model must be the same thread that
 calls eval() into the model, this is called the "eval thread". The thread
 used to perform certain global operations such as saving and tracing must
 be done by a "main thread". In most cases the eval thread and main thread
 are the same thread (i.e. the user's top C++ testbench runs on a single
 thread), but this is not required.
 When running a multithreaded model, the default Linux task scheduler often
 works against the model, by assuming threads are short lived, and thus
 often schedules threads using multiple hyperthreads within the same
 physical core. For best performance use the C<numactl> program to (when the
 threading count fits) select unique physical cores on the same socket. For
 example, if a model was Verilated with "--threads 4", we consult
   egrep 'processor|physical id|core id' /proc/cpuinfo
 To select cores 0, 1, 2, and 3 that are all located on the same socket (0)
 but different physical cores.  (Also useful is "numactl --hardware", or
 C<lscpu> but those don't show Hyperthreading cores.) Then we execute
   numactl -m 0 -C 0,1,2,3 -- verilated_executable_name
 This will limit memory to socket 0, and threads to cores 0, 1, 2, 3,
 (presumably on socket 0) optimizing performance.  Of course this must be
 adjusted if you want another simulator using e.g. socket 1, or if you
 Verilated with a different number of threads.  To see what CPUs are
 actually used, use --prof-threads.
 =head2 Multithreaded Verilog and Library Support
 $display/$stop/$finish are delayed until the end of an eval() call in order
 to maintain ordering between threads. This may result in additional tasks
 completing after the $stop or $finish.
 If using --coverage, the coverage routines are fully thread safe.
 If using --dpi, Verilator assumes pure DPI imports are thread safe,
 balancing performance versus safety. See --threads-dpi.
 If using --savable, the save/restore classes are not multithreaded and
 must be called only by the eval thread.
 If using --sc, the SystemC kernel is not thread safe, therefore the eval
 thread and main thread must be the same.
 If using --trace, the tracing classes must be constructed and called from
 the main thread.
 If using --vpi, since SystemVerilog VPI was not architected by IEEE to be
 multithreaded, Verilator requires all VPI calls are only made from the main
 thread.
 =back
 =head1 CONFIGURATION FILES
 In addition to the command line, warnings and other features may be
@ -3636,6 +3802,21 @@ section for more details.
 Ignoring this warning will only slow simulations, it will simulate
 correctly.
 =item UNOPTTHREADS
 Warns that the thread scheduler was unable to partition the design to fill
 the requested number of threads.
 One workaround is to request fewer threads with C<--threads>.
 Another possible workaround is to allow more MTasks in the runtime, by
 increasing the value of --threads-max-mtasks. More MTasks will result in
 more communication and synchronization overhead at runtime; the scheduler
 attempts to minimize the number of MTasks for this reason.
 Ignoring this warning will only slow simulations, it will simulate
 correctly.
 =item UNPACKED
 Warns that unpacked structs and unions are not supported.
@ -4185,6 +4366,8 @@ performance gain.
 In 2009, major SystemVerilog and DPI language support was added.
 In 2018, Verilator 4.000 was released with multithreaded support.
 Currently, various language features and performance enhancements are added
 as the need arises.  Verilator is now about 3x faster than in 2002, and is
 faster than many popular commercial simulators.
@ -4282,7 +4465,7 @@ License Version 2.0.
 =head1 SEE ALSO
-L<verilator_coverage>, L<verilator_profcfunc>, L<make>,
+L<verilator_coverage>, L<verilator_gantt>, L<verilator_profcfunc>, L<make>,
 L<verilator --help> which is the source for this document,
--- a/bin/verilator_gantt
+++ b/bin/verilator_gantt
@ -0,0 +1,559 @@
 : # -*-Mode: perl;-*- use perl, wherever it is
 eval 'exec perl -wS $0 ${1+"$@"}'
  if 0;
 # See copyright, etc in below POD section.
 ######################################################################
 use strict;
 use warnings;
 use Getopt::Long;
 use Pod::Usage;
 use vars qw ($Debug);
 # Script state and main flow: parse options, read the profile dump, print the
 # report, then optionally write a VCD file.
 $Debug = 0;
 # Input filename; set by parameter() from the first non-option argument
 my $Opt_File;
 my $Opt_Time_Per_Char = 0;  # rdtsc ticks per char in gantt chart, 0=auto
 # Output VCD filename; undef (set by --no-vcd) disables writing the VCD
 my $opt_vcd = "profile_threads.vcd";
 # {<thread>}{<start time>}{mtask|end|cpu}, filled in by read_data()
 our %Threads;
 # {<mtask>}{elapsed|predict|end}, filled in by read_data()
 our %Mtasks;
 # Miscellaneous parsed and derived data: rdtsc_cycle_time, args, stats,
 # cpus and last_end
 our %Global;
 autoflush STDOUT 1;
 autoflush STDERR 1;
 # Require option names to be spelled out in full
 Getopt::Long::config ("no_auto_abbrev");
 if (! GetOptions (
          "help"        => \&usage,
          "scale=i"     => \$Opt_Time_Per_Char,
          "debug"       => sub { $Debug = 1; },
          "vcd=s"       => \$opt_vcd,
          "no-vcd!"     => sub { $opt_vcd = undef; },
          "<>"          => \&parameter,
    )) {
    die "%Error: Bad usage, try 'verilator_gantt --help'\n";
 }
 # Fall back to the default dump filename when none was given
 $Opt_File = "profile_threads.dat" if !defined $Opt_File;
 process($Opt_File);
 write_vcd($opt_vcd) if defined $opt_vcd;
 exit(0);
 #######################################################################
 # Handler for --help: print the complete POD manual on standard output.
 sub usage {
    pod2usage(
        -output  => \*STDOUT,
        -exitval => 2,
        -verbose => 2,
    );
    # Fallback exit in case pod2usage() returns
    exit(1);
 }
 # Handler for non-option arguments: the first one is taken as the input
 # filename, any further one is rejected as an error.
 sub parameter {
    my ($param) = @_;
    die "%Error: Unknown parameter: $param\n" if defined $Opt_File;
    $Opt_File = $param;
 }
 #######################################################################
 # Top-level driver: load the profile data from the given file, then print
 # the report for it.
 sub process {
    my ($filename) = @_;
    read_data($filename);
    report();
 }
 #######################################################################
 # Parse a profile dump into the globals %Threads, %Mtasks and %Global.
 #
 # Recognized lines:
 #   VLPROF mtask <id> start <t> end <t> elapsed <t> predict_time <t>
 #          cpu <n> on thread <n>      one execution of an mtask
 #   VLPROF arg <name>+<value>         argument setting ("+" separated)
 #   VLPROF arg <name> <value>         argument setting (space separated)
 #   VLPROF stat <name> <value>        statistic
 #   VLPROFTHREAD..., #...             ignored
 # Anything else is echoed as "Unk:" when $Debug is set.  A line containing
 # "rdtsc time = <n> ticks" additionally sets $Global{rdtsc_cycle_time}.
 #
 # Dies if the file cannot be opened.
 sub read_data {
    my $filename = shift;
    %Global = (rdtsc_cycle_time => 0);
    # Three-argument open: the filename is never interpreted as a mode or pipe
    open(my $fh, '<', $filename) or die "%Error: $! $filename,";
    while (defined(my $line = <$fh>)) {
        if ($line =~ m/VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)/) {
            my ($mtask, $start, $end, $elapsed_time, $predict_time, $cpu, $thread)
                = ($1, $2, $3, $4, $5, $6, $7);
            $Threads{$thread}{$start}{mtask} = $mtask;
            $Threads{$thread}{$start}{end} = $end;
            $Threads{$thread}{$start}{cpu} = $cpu;
            if (!exists $Mtasks{$mtask}{elapsed}) {
                $Mtasks{$mtask}{elapsed} = 0;
            }
            # An mtask may run several times; accumulate its elapsed time
            # and remember the latest end time
            $Mtasks{$mtask}{elapsed} += $elapsed_time;
            $Mtasks{$mtask}{predict} = $predict_time;
            $Mtasks{$mtask}{end} = max($Mtasks{$mtask}{end}, $end);
        }
        elsif ($line =~ /^VLPROFTHREAD/) {}
        # Values may have several digits, so capture one or more characters
        elsif ($line =~ m/VLPROF arg\s+(\S+)\+([0-9.]+)\s*$/
               || $line =~ m/VLPROF arg\s+(\S+)\s+([0-9.]+)\s*$/) {
            $Global{args}{$1} = $2;
        }
        elsif ($line =~ m/VLPROF stat\s+(\S+)\s+([0-9.]+)/) {
            $Global{stats}{$1} = $2;
        }
        elsif ($line =~ /^#/) {}
        elsif ($Debug) {
            chomp $line;
            print "Unk: $line\n";
        }
        # TODO -- this is parsing text printed by a client.
        # Really, verilator proper should generate this
        # if it's useful...
        if ($line =~ m/rdtsc time = (\d+) ticks/) {
            $Global{rdtsc_cycle_time} = $1;
        }
    }
    close($fh);
 }
 # Print the textual report on standard output: the argument settings, the
 # gantt graph (via report_graph), the utilization analysis, and statistics
 # on the log of the predicted-to-elapsed time ratio of each mtask.
 #
 # Reads %Threads, %Mtasks and %Global; sets $Global{cpus} and
 # $Global{last_end}.
 sub report {
    print "Verilator Gantt report\n";
    print "\nArgument settings:\n";
    foreach my $arg (sort keys %{$Global{args}}) {
        my $plus = ($arg =~ /^\+/) ? "+" : " ";
        printf "  %s%s%d\n", $arg, $plus, $Global{args}{$arg};
    }
    my $nthreads = scalar keys %Threads;
    $Global{cpus}{cpu_time} = {};
    foreach my $thread (keys %Threads) {
        # Accumulate the busy time seen on each cpu
        foreach my $start (keys %{$Threads{$thread}}) {
            my $cpu = $Threads{$thread}{$start}{cpu};
            my $elapsed = $Threads{$thread}{$start}{end} - $start;
            $Global{cpus}{cpu_time}{$cpu} += $elapsed;
        }
    }
    my $mt_mtask_time = 0;
    my $long_mtask_time = 0;
    my $last_end = 0;
    foreach my $mtask (keys %Mtasks) {
        $mt_mtask_time += $Mtasks{$mtask}{elapsed};
        $last_end = max($last_end, $Mtasks{$mtask}{end});
        $long_mtask_time = max($long_mtask_time, $Mtasks{$mtask}{elapsed});
    }
    $Global{last_end} = $last_end;
    report_graph();
    # If we know cycle time in the same (rdtsc) units,
    # this will give us an actual utilization number,
    # (how effectively we keep the cores busy.)
    #
    # It also gives us a number we can compare against
    # serial mode, to estimate the overhead of data sharing,
    # which will show up in the total elapsed time. (Overhead
    # of synchronization and scheduling should not.)
    my $yields = $Global{stats}{yields};
    $yields = 0 if !defined $yields;  # Stat line may be absent from the dump
    print "\nAnalysis:\n";
    printf "  Total threads             = %d\n", $nthreads;
    printf "  Total mtasks              = %d\n", scalar (keys %Mtasks);
    # The cpus are the keys of the cpu_time hash, not of $Global{cpus} itself
    printf "  Total cpus used           = %d\n", scalar (keys %{$Global{cpus}{cpu_time}});
    printf "  Total yields              = %d\n", $yields;
    printf "  Total eval time           = %d rdtsc ticks\n", $Global{last_end};
    printf "  Longest mtask time        = %d rdtsc ticks\n", $long_mtask_time;
    printf "  All-thread mtask time     = %d rdtsc ticks\n", $mt_mtask_time;
    # Report zero instead of dividing by zero when no mtask data was read
    my $long_efficiency = ($Global{last_end}
                           ? $long_mtask_time/($Global{last_end}) : 0);
    printf "  Longest-thread efficiency = %0.1f%%\n", $long_efficiency*100;
    my $mt_efficiency = (($Global{last_end} && $nthreads)
                         ? $mt_mtask_time/($Global{last_end}*$nthreads) : 0);
    printf "  All-thread efficiency     = %0.1f%%\n", $mt_efficiency*100;
    printf "  All-thread speedup        = %0.1f\n", $mt_efficiency*$nthreads;
    if ($Global{rdtsc_cycle_time} > 0) {
        my $ut = $mt_mtask_time / $Global{rdtsc_cycle_time};
        print "tot_mtask_cpu=$mt_mtask_time cyc=$Global{rdtsc_cycle_time} ut=$ut\n";
    }
    my @p2e_ratios;
    my $min_p2e = 1000000;
    my $min_mtask;
    my $max_p2e = -1000000;
    my $max_mtask;
    foreach my $mtask (sort keys %Mtasks) {
        if ($Mtasks{$mtask}{elapsed} > 0) {
            if ($Mtasks{$mtask}{predict} == 0) {
                $Mtasks{$mtask}{predict} = 1;  # don't log(0) below
            }
            my $p2e_ratio = log( $Mtasks{$mtask}{predict} / $Mtasks{$mtask}{elapsed} );
            #print "log(p2e $mtask) = $p2e_ratio   (predict $Mtasks{$mtask}{predict}, elapsed $Mtasks{$mtask}{elapsed})\n";
            push @p2e_ratios, $p2e_ratio;
            if ($p2e_ratio > $max_p2e) {
                $max_p2e = $p2e_ratio;
                $max_mtask = $mtask;
            }
            if ($p2e_ratio < $min_p2e) {
                $min_p2e = $p2e_ratio;
                $min_mtask = $mtask;
            }
        }
    }
    print "\nStatistics:\n";
    if (!@p2e_ratios) {
        # No mtask ran for a measurable time, so min/max/mean are undefined
        print "  No mtasks with nonzero elapsed time\n";
        print "\n";
        return;
    }
    print "  min log(p2e) = $min_p2e  from mtask $min_mtask (predict $Mtasks{$min_mtask}{predict}, elapsed $Mtasks{$min_mtask}{elapsed})\n";
    print "  max log(p2e) = $max_p2e  from mtask $max_mtask (predict $Mtasks{$max_mtask}{predict}, elapsed $Mtasks{$max_mtask}{elapsed})\n";
    my $stddev = stddev(\@p2e_ratios);
    my $mean = mean(\@p2e_ratios);
    print "  mean = " . ($mean) . "\n";
    print "  stddev = " . ($stddev) . "\n";
    print "  e ^ stddev = " . exp($stddev). "\n";
    print "\n";
 }
 # Print the per-thread gantt graph on standard output.
 #
 # The horizontal scale (rdtsc ticks per character) is $Opt_Time_Per_Char;
 # when that is 0 a scale is searched for automatically, starting at about 40
 # columns and halving until _make_graph reports no conflicts or the scale
 # is down to 10 ticks or fewer per character.
 sub report_graph {
    my $time_per = $Opt_Time_Per_Char;
    if ($time_per == 0) {
        $time_per = ($Global{last_end} / 40);  # Start with 40 columns
        while ($time_per > 10) {
            my (undef, $conflicts) = _make_graph($time_per);
            last if !$conflicts;
            $time_per = int($time_per/2);
        }
        # One more step so we can fit more labels
        $time_per = int($time_per/2);
    }
    # Short runs can take the automatic scale down to zero, and --scale
    # accepts negative numbers; both would break the column arithmetic
    # below, so use at least one tick per character
    $time_per = 1 if $time_per < 1;
    my ($graph, $conflicts) = _make_graph($time_per);
    print "\nThread gantt graph:\n";
    print "  Legend: One character width = $time_per rdtsc ticks\n";
    print "  Legend: '&' = multiple mtasks in this period (character width)\n";
    my $scale = "   <-".$Global{last_end}." rdtsc total";
    # Pad the scale line with dashes out to the column of the last end time
    for (my $col = length($scale);  # -2 for '->' below
         $col < ($Global{last_end}/$time_per); ++$col) {
        $scale .= "-";
    }
    print "  $scale->\n";
    foreach my $thread (sort keys %{$graph}) {
        print "  t: ";
        _print_graph_line($graph->{$thread}, '');
    }
 }
 # Build the character graph for every thread in %Threads at the given scale.
 #
 # Argument: $time_per, the number of time units per character column.
 # Returns ($graph, $conflicts) where $graph->{<thread>}[<column>]{char} is
 # the character for that column and $conflicts counts columns that were
 # collapsed to "&" because labels of two or more mtasks began in the same
 # column and could not be laid out in full.
 sub _make_graph {
    my $time_per = shift;
    my $graph = {};  # {thread}{column}{char=>'x' or chars=>#}
    my $conflicts = 0;
    foreach my $thread (keys %Threads) {
        # Make potentially multiple characters per column
        foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) {
            my $end = $Threads{$thread}{$start}{end};
            my $mtask = $Threads{$thread}{$start}{mtask};
            my $cpu = $Threads{$thread}{$start}{cpu};
            my $startcol = _time_col($time_per, $start);
            my $endcol = _time_col($time_per, $end);
            # Label is "[", the cpu number, "-" padding up to the width, "]"
            my $label = "[";
            $label .= "$cpu";  # Maybe make optional in future
            my $width = $endcol - $startcol + 1;
            while (length($label) < ($width-1)) {  # -1 for ']'
                $label .= "-";
            }
            $label .= "]";
            # Labels starting in the same column are concatenated
            $graph->{$thread}[$startcol]{char} .= $label;
        }
        if ($Debug) {
            print "# Multicol: "; _print_graph_line($graph->{$thread}, '|');
        }
        # Expand line to one char per column
        for (my $col = 0; $col <= $#{$graph->{$thread}}; ++$col) {
            if (my $chars = $graph->{$thread}[$col]{char}) {
                # The text fits only if every column it would cover is empty
                my $ok = 1;
                for (my $coladd = 1; $coladd<length($chars); ++$coladd) {
                    if ($graph->{$thread}[$col + $coladd]{char}) {
                        $ok = 0; last;
                    }
                }
                if (!$ok) {
                    if ($chars =~ /\[.*\[/) {  # Two begins or more
                        $conflicts++;
                        $graph->{$thread}[$col]{char} = "&";
                    } else {
                        $graph->{$thread}[$col]{char} = "[";
                    }
                    # Fill with 'x' up to the next occupied column
                    for (my $coladd = 1; $coladd<length($chars); ++$coladd) {
                        if ($graph->{$thread}[$col + $coladd]{char}) {
                            last;
                        } else {
                            $graph->{$thread}[$col + $coladd]{char} = 'x';
                        }
                    }
                } else {
                    # Spread the text out, one character per column
                    my $coladd = 0;
                    foreach my $char (split //, $chars) {
                        $graph->{$thread}[$col+$coladd]{char} = $char;
                        ++$coladd;
                    }
                }
            }
        }
        if ($Debug) {
            print "# Singlcol: "; _print_graph_line($graph->{$thread}, '|');
        }
    }
    print "# Conflicts $conflicts\n" if $Debug;
    return ($graph, $conflicts);
 }
 # Print one thread's row of the graph followed by a newline.  Each column's
 # {char} is printed (a space when it has none), followed by the separator.
 sub _print_graph_line {
    my ($graph_thread, $sep) = @_;
    foreach my $col (0 .. $#{$graph_thread}) {
        my $cell = $graph_thread->[$col]{char};
        my $shown = defined $cell ? $cell : ' ';
        print $shown, $sep;
    }
    print "\n";
 }
 # Convert a time to a graph column index: the time divided by the number of
 # time units per column, truncated to an integer.
 sub _time_col {
    my ($time_per, $time) = @_;
    my $col = $time / $time_per;
    return int($col);
 }
 #######################################################################
 # Write the data in %Threads to a VCD file with these signals:
 #   threads.thread<N>_mtask  mtask running on thread N
 #   cpus.cpu<N>_thread       thread running on cpu N
 #   mtasks.mtask<N>_cpu      cpu that mtask N is running on
 #   Stats.parallelism        number of mtasks active at that time
 # A signal is driven to Z while nothing is running on it.
 #
 # Argument: the output filename.  Dies if the file cannot be opened, or if
 # closing it reports an error.
 sub write_vcd {
    my $filename = shift;
    print "Writing $filename\n";
    # Three-argument open: the filename is never interpreted as a mode or pipe
    open(my $fh, '>', $filename) or die "%Error: $! $filename,";
    my $vcd = {values => {},  # {<time>}{<code>} = value
               sigs => {},  # {<module>}{<sig>} = code
               code => 0,
    };
    # Return the code of a signal, allocating the next free code on first
    # use.  Tests definedness, as code 0 is a valid (but false) code.
    my $sig_code = sub {
        my ($module, $sig) = @_;
        if (!defined $vcd->{sigs}{$module}{$sig}) {
            $vcd->{sigs}{$module}{$sig} = $vcd->{code}++;
        }
        return $vcd->{sigs}{$module}{$sig};
    };
    my %parallelism;
    foreach my $thread (keys %Threads) {
        my $tcode = $sig_code->("threads", "thread${thread}_mtask");
        foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) {
            my $end = $Threads{$thread}{$start}{end};
            my $mtask = $Threads{$thread}{$start}{mtask};
            my $cpu = $Threads{$thread}{$start}{cpu};
            # Each signal takes its value at the start and goes Z at the end
            $vcd->{values}{$start}{$tcode} = $mtask;
            $vcd->{values}{$end}{$tcode} = undef;
            $parallelism{$start}++;
            $parallelism{$end}--;
            my $ccode = $sig_code->("cpus", "cpu${cpu}_thread");
            $vcd->{values}{$start}{$ccode} = $thread;
            $vcd->{values}{$end}{$ccode} = undef;
            my $mcode = $sig_code->("mtasks", "mtask${mtask}_cpu");
            $vcd->{values}{$start}{$mcode} = $cpu;
            $vcd->{values}{$end}{$mcode} = undef;
        }
    }
    {
        # Running sum of the +1 (start) and -1 (end) events, in time order
        my $pcode = $sig_code->("Stats", "parallelism");
        my $value = 0;
        foreach my $time (sort {$a<=>$b} keys %parallelism) {
            $value += $parallelism{$time};
            $vcd->{values}{$time}{$pcode} = $value;
        }
    }
    print {$fh} '$version Generated by verilator_gantt $end'."\n";
    print {$fh} '$timescale 1ns $end'."\n";
    print {$fh} "\n";
    my %all_codes;
    print {$fh} ' $scope module gantt $end'."\n";
    foreach my $module (sort keys %{$vcd->{sigs}}) {
        printf {$fh} '  $scope module %s $end'."\n", $module;
        foreach my $sig (sort keys %{$vcd->{sigs}{$module}}) {
            my $code = $vcd->{sigs}{$module}{$sig};
            printf {$fh} '   $var wire 32 v%x %s [31:0] $end'."\n",
                $code, $sig;
            $all_codes{$code} = 1;
        }
        print {$fh} '  $upscope $end'."\n";
    }
    print {$fh} ' $upscope $end'."\n";
    print {$fh} '$enddefinitions $end'."\n";
    print {$fh} "\n";
    my $first = 1;
    foreach my $time (sort {$a <=> $b} keys %{$vcd->{values}}) {
        if ($first) {
            $first = 0;
            # Start with Z for any signals without time zero data
            foreach my $code (keys %all_codes) {
                if (!defined $vcd->{values}{$time}{$code}) {
                    $vcd->{values}{$time}{$code} = undef;
                }
            }
        }
        printf {$fh} "#%d\n", $time;
        foreach my $code (sort keys %{$vcd->{values}{$time}}) {
            my $value = $vcd->{values}{$time}{$code};
            if (defined $value) {
                printf {$fh} "b%b v%x\n", $value, $code;
            } else {
                printf {$fh} "bz v%x\n", $code;
            }
        }
    }
    # Buffered write errors (e.g. disk full) only show up at close
    close($fh) or die "%Error: $! $filename,";
 }
 #######################################################################
 # Similar to Statistics::Basic functions, but avoid a package dependency
 # Return the numerically largest defined argument, or undef when there is
 # none.  Undefined arguments are skipped wherever they appear in the list.
 sub max {
    my $n;
    foreach my $v (@_) {
        next if !defined $v;
        $n = $v if !defined $n || $v > $n;
    }
    return $n;
 }
 # Return the arithmetic mean of the numbers in the referenced array, or
 # undef when the array is empty.
 sub mean {
    my ($values) = @_;
    my ($total, $count) = (0, 0);
    for my $value (@$values) {
        $total += $value;
        ++$count;
    }
    return $count ? $total/$count : undef;
 }
 # Return the population standard deviation of the numbers in the referenced
 # array, sqrt(mean of squares - square of mean), or undef when the array is
 # empty.
 sub stddev {
    my $arrayref = shift;
    my $n = 0;
    my $sum = 0;
    my $sumsq = 0;
    foreach my $v (@$arrayref) {
        $sum += $v;
        $sumsq += $v**2;
        $n++;
    }
    return undef if !$n;
    my $variance = ($sumsq/$n) - ($sum/$n)**2;
    # Floating point rounding can leave a tiny negative variance for
    # (nearly) identical values, and sqrt() of a negative number is fatal
    $variance = 0 if $variance < 0;
    return sqrt($variance);
 }
 #######################################################################
 __END__
 =pod
 =head1 NAME
 verilator_gantt - Create Gantt chart of multi-threaded execution
 =head1 SYNOPSIS
 Creates a visual representation to help analyze Verilator multithreaded
 simulation performance, by showing when each macro-task starts and ends,
 and showing when each thread is busy or idle.
 The generated Gantt chart has time on the X-axis. Times shown are to the
 scale printed, i.e. a certain amount of time for each character width.  The
 Y-axis shows threads, each thread's execution is shown on one line.  That
 line shows "[" at the position in time when it executes.
 Following the "[" is the cpu number the task executed on, followed by zero
 or more "-" to make the width of the characters match the scaled execution
 time, followed by a "]".  If the scale is too small, the cpu number and
 mtask number will not be printed.  If the scale is very small, a "&"
 indicates multiple mtasks started at that time position.
 Also creates a value change dump (VCD) format dump file which may be viewed
 in a waveform viewer (e.g. C<GTKWave>).  See below.
 =head1 USAGE
  Build with --prof-threads.
  Run a sim with +verilator+prof+threads+window 2.
  This will create profile_threads.dat.
  Then run:
  verilator_gantt profile_threads.dat
  The report will be printed on standard output, this also generates
  profile_threads.vcd
  View profile_threads.vcd in a waveform viewer.
 =head1 VCD SIGNALS
 In waveforms there are the following signals. For most signals, the
 "decimal" format will remove the leading zeros and make the traces easier
 to read.
 parallelism: The number of mtasks active at this time, for best performance
 this will match the thread count. You may want to use an "analog step"
 format to view this signal.
 cpu#_thread: For the given CPU number, the thread number executing.
 mtask#_cpu: For the given mtask id, the CPU it is executing on.
 thread#_mtask: For the given thread number, the mtask id executing.
 =head1 ARGUMENTS
 =over 4
 =item I<filename>
 The filename to read data from, defaults to "profile_threads.dat".
 =item --help
 Displays this message and program version and exits.
 =item --scale I<n>
 On the X-axis of the generated Gantt chart, each character represents this
 many time units. (On x86, time units are rdtsc ticks.)  Defaults to 0,
 which will automatically compute a reasonable scale where no two mtasks
 need to fit into same character width's worth of scaled time.
 =item --no-vcd
 =item --vcd I<filename>
 Set output filename for vcd dump, or disable. Default is
 profile_threads.vcd.
 =back
 =head1 DISTRIBUTION
 The latest version is available from L<http://www.veripool.org/>.
 Copyright 2018-2018 by Wilson Snyder.  Verilator is free software; you can
 redistribute it and/or modify it under the terms of either the GNU Lesser
 General Public License Version 3 or the Perl Artistic License Version 2.0.
 =head1 AUTHORS
 Wilson Snyder <wsnyder@wsnyder.org>
 =head1 SEE ALSO
 C<verilator>
 =cut
 ######################################################################
 ### Local Variables:
 ### compile-command: "$V4/bin/verilator_gantt $V4/test_regress/obj_vltmt/t_gantt/vlt_sim.log"
 ### End:
--- a/include/verilated.cpp
+++ b/include/verilated.cpp
@ -38,6 +38,7 @@ VerilatedVoidCb Verilated::s_flushCb = NULL;
 // Keep below together in one cache line
 Verilated::Serialized Verilated::s_s;
 Verilated::NonSerialized Verilated::s_ns;
 VL_THREAD_LOCAL Verilated::ThreadLocal Verilated::t_s;
 Verilated::CommandArgValues Verilated::s_args;
@ -196,6 +197,17 @@ Verilated::Serialized::Serialized() {
    s_fatalOnVpiError = true; // retains old default behaviour
 }
 // Establish the defaults for the +verilator+prof+threads+* runtime settings.
 // The filename is a heap copy so that the destructor and the setter can
 // both release it with free().
 Verilated::NonSerialized::NonSerialized()
    : s_profThreadsStart(1)
    , s_profThreadsWindow(2)
    , s_profThreadsFilenamep(strdup("profile_threads.dat")) {}
 // Release the heap-allocated profile filename and clear the member so it
 // cannot be left dangling.
 Verilated::NonSerialized::~NonSerialized() {
    char* const namep = const_cast<char*>(s_profThreadsFilenamep);
    s_profThreadsFilenamep = NULL;
    free(namep);  // free(NULL) is a harmless no-op
 }
 //===========================================================================
 // Random reset -- Only called at init time, so don't inline.
@ -1648,6 +1660,20 @@ void Verilated::fatalOnVpiError(bool flag) VL_MT_SAFE {
    VerilatedLockGuard lock(m_mutex);
    s_s.s_fatalOnVpiError = flag;
 }
 // Set the +prof+threads starting time; the write is made under m_mutex.
 void Verilated::profThreadsStart(vluint64_t flag) VL_MT_SAFE {
    VerilatedLockGuard guard(m_mutex);
    Verilated::s_ns.s_profThreadsStart = flag;
 }
 // Set the +prof+threads window size; the write is made under m_mutex.
 // The stored field is 32 bits wide, so the 64-bit argument is narrowed.
 void Verilated::profThreadsWindow(vluint64_t flag) VL_MT_SAFE {
    VerilatedLockGuard guard(m_mutex);
    Verilated::s_ns.s_profThreadsWindow = static_cast<vluint32_t>(flag);
 }
 // Replace the +prof+threads output filename with a private copy of 'flagp'.
 // The previous copy (if any) is released. The update is made under m_mutex.
 void Verilated::profThreadsFilenamep(const char* flagp) VL_MT_SAFE {
    VerilatedLockGuard lock(m_mutex);
    // Duplicate BEFORE freeing: 'flagp' may alias the currently stored string
    // (e.g. a caller passing back the result of profThreadsFilenamep()), in
    // which case free-then-strdup would read freed memory.
    const char* const newp = strdup(flagp);
    if (s_ns.s_profThreadsFilenamep) free(const_cast<char*>(s_ns.s_profThreadsFilenamep));
    s_ns.s_profThreadsFilenamep = newp;
 }
 const char* Verilated::catName(const char* n1, const char* n2) VL_MT_SAFE {
    // Returns new'ed data
@ -1800,6 +1826,15 @@ void VerilatedImp::commandArgVl(const std::string& arg) {
            VL_PRINTF_MT("For help, please see 'verilator --help'\n");
            VL_FATAL_MT("COMMAND_LINE", 0, "", "Exiting due to command line argument (not an error)");
        }
        else if (commandArgVlValue(arg, "+verilator+prof+threads+start+", value/*ref*/)) {
            Verilated::profThreadsStart(atoll(value.c_str()));
        }
        else if (commandArgVlValue(arg, "+verilator+prof+threads+window+", value/*ref*/)) {
            Verilated::profThreadsWindow(atol(value.c_str()));
        }
        else if (commandArgVlValue(arg, "+verilator+prof+threads+file+", value/*ref*/)) {
            Verilated::profThreadsFilenamep(value.c_str());
        }
        else if (commandArgVlValue(arg, "+verilator+rand+reset+", value/*ref*/)) {
            Verilated::randReset(atoi(value.c_str()));
        }
--- a/include/verilated.h
+++ b/include/verilated.h
@ -344,6 +344,17 @@ class Verilated {
        ~Serialized() {}
    } s_s;
    static struct NonSerialized {  // Non-serialized information
        // These are reloaded from command-line settings, so do not need to persist
        // Fast path
        vluint64_t s_profThreadsStart;  ///< +prof+threads starting time
        vluint32_t s_profThreadsWindow;  ///< +prof+threads window size
        // Slow path
        const char* s_profThreadsFilenamep;  ///< +prof+threads filename
        NonSerialized();
        ~NonSerialized();
    } s_ns;
    // no need to be save-restored (serialized) the
    // assumption is that the restore is allowed to pass different arguments
    static struct CommandArgValues {
@ -409,6 +420,14 @@ public:
    /// Enable/disable vpi fatal
    static void fatalOnVpiError(bool flag) VL_MT_SAFE;
    static bool fatalOnVpiError() VL_MT_SAFE { return s_s.s_fatalOnVpiError; }
    /// --prof-threads related settings
    // The getters below return the value held in s_ns directly.
    static void profThreadsStart(vluint64_t flag) VL_MT_SAFE;
    static vluint64_t profThreadsStart() VL_MT_SAFE { return s_ns.s_profThreadsStart; }
    // NOTE(review): this setter takes 64 bits while the getter returns 32 bits;
    // confirm whether values above 32 bits are meant to be accepted.
    static void profThreadsWindow(vluint64_t flag) VL_MT_SAFE;
    static vluint32_t profThreadsWindow() VL_MT_SAFE { return s_ns.s_profThreadsWindow; }
    static void profThreadsFilenamep(const char* flagp) VL_MT_SAFE;
    static const char* profThreadsFilenamep() VL_MT_SAFE { return s_ns.s_profThreadsFilenamep; }
    /// Flush callback for VCD waves
    static void flushCb(VerilatedVoidCb cb) VL_MT_SAFE;
    static void flushCall() VL_MT_SAFE;
--- a/include/verilated_threads.cpp
+++ b/include/verilated_threads.cpp
@ -0,0 +1,229 @@
 // -*- mode: C++; c-file-style: "cc-mode" -*-
 //=============================================================================
 //
 // THIS MODULE IS PUBLICLY LICENSED
 //
 // Copyright 2012-2018 by Wilson Snyder.  This program is free software;
 // you can redistribute it and/or modify it under the terms of either the GNU
 // Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
 //
 // This is distributed in the hope that it will be useful, but WITHOUT ANY
 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 // for more details.
 //
 //=============================================================================
 ///
 /// \file
 /// \brief Thread pool for verilated modules
 ///
 //=============================================================================
 #include "verilatedos.h"
 #include "verilated_threads.h"
 #include <cstdio>
 // Out-of-class definitions for the static / thread-local data members.
 std::atomic<vluint64_t> VlNotification::s_yields;
 VL_THREAD_LOCAL VlThreadPool::ProfileTrace* VlThreadPool::t_profilep = NULL;
 //=============================================================================
 // VlMTaskVertex
 // Start with no upstream dependencies done; remember how many are needed.
 VlMTaskVertex::VlMTaskVertex(vluint32_t upstreamDepCount)
    : m_upstreamDepsDone(0)
    , m_upstreamDepCount(upstreamDepCount) {
    // The dependency counter is expected to be a lock-free atomic.
    assert(m_upstreamDepsDone.is_lock_free());
 }
 //=============================================================================
 // VlWorkerThread
 // Construct a worker bound to 'poolp' and immediately launch its thread,
 // which enters startWorker(this).
 VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
    : m_poolp(poolp)
    , m_profiling(profiling)
    , m_exiting(false)
      // Must init this last -- after setting up fields that it might read:
    , m_cthread(startWorker, this) {}
 // Ask the worker thread to exit, wake it if it is asleep, and join it.
 VlWorkerThread::~VlWorkerThread() {
    // Publish the exit request before possibly waking the thread
    m_exiting.store(true, std::memory_order_release);
    {
        // sleeping() and wakeUp() both require m_mutex
        VerilatedLockGuard lk(m_mutex);
        if (sleeping()) {
            wakeUp();
        }
    }
    // The thread should exit; join it.
    m_cthread.join();
 }
 // Body of the worker thread: repeatedly obtain one ExecRec (from m_ready,
 // or handed over by addTask() while asleep) and run it, until m_exiting
 // is observed. Sets up and tears down this thread's profiling buffer when
 // profiling is enabled.
 void VlWorkerThread::workerLoop() {
    if (VL_UNLIKELY(m_profiling)) {
        m_poolp->setupProfilingClientThread();
    }
    VlNotification alarm;
    ExecRec work;
    work.m_fnp = NULL;
    while (1) {
        bool sleep = false;
        if (VL_UNLIKELY(!work.m_fnp)) {
            // Look for work
            VerilatedLockGuard lk(m_mutex);
            if (VL_LIKELY(!m_ready.empty())) {
                dequeWork(&work);
            } else {
                // No work available, prepare to sleep. Pass alarm/work
                // into m_sleepAlarm so wakeUp will tell this function.
                //
                // Must modify m_sleepAlarm in the same critical section as
                // the check for ready work, otherwise we could race with
                // another thread enqueueing work and never be awoken.
                m_sleepAlarm.first = &alarm;
                m_sleepAlarm.second = &work;
                sleep = true;
            }
        }
        // Do this here, not above, to avoid a race with the destructor.
        // NOTE(review): if we break here with sleep==true, m_sleepAlarm still
        // points at the local 'alarm'/'work'; confirm the destructor's
        // wakeUp() cannot touch them after this function has returned.
        if (VL_UNLIKELY(m_exiting.load(std::memory_order_acquire)))
            break;
        if (VL_UNLIKELY(sleep)) {
            alarm.waitForNotification();  // ZZZzzzzz
            alarm.reset();
        }
        // After a wake-up, 'work' may have been filled in by addTask()
        if (VL_LIKELY(work.m_fnp)) {
            work.m_fnp(work.m_evenCycle, work.m_sym);
            work.m_fnp = NULL;
        }
    }
    if (VL_UNLIKELY(m_profiling)) {
        m_poolp->tearDownProfilingClientThread();
    }
 }
 // Static thread entry point: forwards to the worker's own workerLoop().
 void VlWorkerThread::startWorker(VlWorkerThread* workerp) {
    workerp->workerLoop();
 }
 //=============================================================================
 // VlThreadPool
 // Create a pool of 'nThreads' worker threads. When 'profiling' is set, each
 // worker and the constructing thread get a profiling buffer.
 VlThreadPool::VlThreadPool(int nThreads, bool profiling)
    : m_profiling(profiling) {
    // --threads N passes nThreads=N-1, as the "main" threads counts as 1
    const int needed = nThreads + 1;
    const unsigned cpus = std::thread::hardware_concurrency();
    // hardware_concurrency() returns 0 when the CPU count cannot be
    // determined; do not print a bogus "0 CPUs" warning in that case.
    // Guarding needed>0 also keeps the comparison free of sign surprises.
    if (cpus != 0 && needed > 0 && cpus < static_cast<unsigned>(needed)) {
        VL_PRINTF_MT("%%Warning: System has %u CPUs but model Verilated with"
                     " --threads %d; may run slow.\n", cpus, needed);
    }
    // Create'em
    for (int i=0; i<nThreads; ++i) {
        m_workers.push_back(new VlWorkerThread(this, profiling));
    }
    // Set up a profile buffer for the current thread too -- on the
    // assumption that it's the same thread that calls eval and may be
    // donated to run mtasks during the eval.
    if (VL_UNLIKELY(m_profiling)) {
        setupProfilingClientThread();
    }
 }
 // Destroy every worker in creation order, then release the calling thread's
 // profiling buffer when profiling is enabled.
 VlThreadPool::~VlThreadPool() {
    for (std::vector<VlWorkerThread*>::iterator it = m_workers.begin();
         it != m_workers.end(); ++it) {
        // Each ~WorkerThread will wait for its thread to exit.
        delete *it;
    }
    if (VL_UNLIKELY(m_profiling)) {
        tearDownProfilingClientThread();
    }
 }
 void VlThreadPool::tearDownProfilingClientThread() {
    assert(t_profilep);
    delete t_profilep;
    t_profilep = NULL;
 }
 void VlThreadPool::setupProfilingClientThread() {
    assert(!t_profilep);
    t_profilep = new ProfileTrace;
    // Reserve some space in the thread-local profiling buffer;
    // try not to malloc while collecting profiling.
    t_profilep->reserve(4096);
    {
        VerilatedLockGuard lk(m_mutex);
        m_allProfiles.insert(t_profilep);
    }
 }
 void VlThreadPool::profileAppendAll(const VlProfileRec& rec) {
    VerilatedLockGuard lk(m_mutex);
    for (ProfileSet::iterator it = m_allProfiles.begin();
         it != m_allProfiles.end(); ++it) {
        // Every thread's profile trace gets a copy of rec.
        (*it)->emplace_back(rec);
    }
 }
 // Write all collected profile records to 'filenamep' as text.
 //
 // The file starts with a "VLPROFTHREAD 1.0" header and the relevant
 // arguments/statistics. One "VLPROF mtask" line follows for every
 // TYPE_MTASK_RUN record seen after the first TYPE_BARRIER record in each
 // thread's trace; earlier records are treated as warmup and skipped.
 // 'ticksElapsed' is written as the final "stat ticks" line.
 // Reports a fatal error when the file cannot be opened for writing.
 void VlThreadPool::profileDump(const char* filenamep, vluint64_t ticksElapsed) {
    VerilatedLockGuard lk(m_mutex);
    VL_DEBUG_IF(VL_DBG_MSGF("+prof+threads writing to '%s'\n", filenamep););
    FILE* fp = fopen(filenamep, "w");
    if (VL_UNLIKELY(!fp)) {
        VL_FATAL_MT(filenamep, 0, "", "+prof+threads+file file not writable");
        return;
    }
    // TODO Perhaps merge with verilated_coverage output format, so can
    // have a common merging and reporting tool, etc.
    fprintf(fp, "VLPROFTHREAD 1.0 # Verilator thread profile dump version 1.0\n");
    fprintf(fp, "VLPROF arg --threads %" VL_PRI64 "u\n",
            vluint64_t(m_workers.size()+1));
    fprintf(fp, "VLPROF arg +verilator+prof+threads+start+%" VL_PRI64 "u\n",
            Verilated::profThreadsStart());
    fprintf(fp, "VLPROF arg +verilator+prof+threads+window+%u\n",
            Verilated::profThreadsWindow());
    fprintf(fp, "VLPROF stat yields %" VL_PRI64 "u\n",
            VlNotification::yields());
    // Threads are numbered from 1 in m_allProfiles iteration order
    vluint32_t thread_id = 0;
    for (ProfileSet::iterator pit = m_allProfiles.begin();
         pit != m_allProfiles.end(); ++pit) {
        ++thread_id;
        bool printing = false;  // False while in warmup phase
        for (ProfileTrace::iterator eit = (*pit)->begin();
             eit != (*pit)->end(); ++eit) {
            switch (eit->m_type) {
            case VlProfileRec::TYPE_BARRIER:
                printing = true;
                break;
            case VlProfileRec::TYPE_MTASK_RUN:
                if (!printing) break;
                // m_mtaskId is unsigned, so print it with %u (not %d)
                fprintf(fp, "VLPROF mtask %u"
                        " start %" VL_PRI64 "u end %" VL_PRI64 "u elapsed %" VL_PRI64 "u"
                        " predict_time %u cpu %u on thread %u\n",
                        eit->m_mtaskId,
                        eit->m_startTime,
                        eit->m_endTime,
                        (eit->m_endTime - eit->m_startTime),
                        eit->m_predictTime,
                        eit->m_cpu,
                        thread_id);
                break;
            default: assert(false);
                break;
            }
        }
    }
    fprintf(fp, "VLPROF stat ticks %" VL_PRI64 "u\n",
            ticksElapsed);
    fclose(fp);
 }
--- a/include/verilated_threads.h
+++ b/include/verilated_threads.h
@ -0,0 +1,313 @@
 // -*- mode: C++; c-file-style: "cc-mode" -*-
 //=============================================================================
 //
 // THIS MODULE IS PUBLICLY LICENSED
 //
 // Copyright 2012-2018 by Wilson Snyder.  This program is free software;
 // you can redistribute it and/or modify it under the terms of either the GNU
 // Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
 //
 // This is distributed in the hope that it will be useful, but WITHOUT ANY
 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 // for more details.
 //
 //=============================================================================
 ///
 /// \file
 /// \brief Thread pool and profiling for Verilated modules
 ///
 //=============================================================================
 #ifndef _VERILATED_THREADS_H_
 #define _VERILATED_THREADS_H_
 #include "verilatedos.h"
 #include <atomic>
 #include <thread>
 #include <vector>
 #include <set>
 #include <sched.h>  // For sched_getcpu()
 #include "verilated.h"  // for VerilatedMutex and clang annotations
 // VlMTaskVertex and VlThreadpool will work with multiple symbol table types.
 // Since the type is opaque to VlMTaskVertex and VlThreadPool, represent it
 // as a void* here.
 typedef void* VlThrSymTab;
 // A resettable notification flag. A waiter spins (yielding the CPU
 // periodically) until another thread calls notify().
 class VlNotification {
    // MEMBERS
    std::atomic<bool> m_notified;  // Notification pending
    static std::atomic<vluint64_t> s_yields;  // Statistics
 public:
    // CONSTRUCTORS
    VlNotification()
        : m_notified(false) {
        assert(atomic_is_lock_free(&m_notified));
    }
    ~VlNotification() {}
    // METHODS
    // Total number of times any waiter has yielded while waiting
    static vluint64_t yields() { return s_yields; }
    // Block until notify() has occurred, then return.
    // If notify() has already occurred, return immediately.
    //
    // This is logically const: the object will remain in notified state
    // after waitForNotification() returns, so you could notify more than
    // one thread of the same event.
    inline void waitForNotification() {
        unsigned ct = 0;
        while (VL_UNLIKELY(!notified())) {
            VL_CPU_RELAX();
            ct++;
            // After more than VL_LOCK_SPINS spins, yield and start counting again
            if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
                ct = 0;
                ++s_yields;  // Statistics
                std::this_thread::yield();
            }
        }
    }
    // The 'inline' keyword here means nothing to the compiler, it's
    // implicit on methods defined within the class body anyway.
    //
    // 'inline' is attached to this method, and others in this file,
    // to remind humans that some routines in this file are called many
    // times per cycle in threaded mode. Such routines should be
    // inlinable; that's why they're declared in the .h and not the .cpp.
    inline bool notified() {
        return m_notified.load(std::memory_order_acquire);
    }
    // Set notified state. If state is already notified,
    // it remains so.
    inline void notify() {
        m_notified.store(true, std::memory_order_release);
    }
    // Reset the state to un-notified state, which is also the
    // state of a new Notification object.
    inline void reset() {
        m_notified.store(false, std::memory_order_relaxed);
    }
 };
 typedef void (*VlExecFnp)(bool, VlThrSymTab);
 /// Track dependencies for a single MTask.
 class VlMTaskVertex {
    // MEMBERS
    // On even cycles, m_upstreamDepsDone increases as upstream
    // dependencies complete. When it reaches m_upstreamDepCount,
    // this MTaskVertex is ready.
    //
    // On odd cycles, m_upstreamDepsDone decreases as upstream
    // dependencies complete, and when it reaches zero this MTaskVertex
    // is ready.
    //
    // An atomic is smaller than a mutex, and lock-free.
    //
    // (Why does the size of this class matter? If an mtask has many
    // downstream mtasks to notify, we hope these will pack into a
    // small number of cache lines to reduce the cost of pointer chasing
    // during done-notification. Nobody's quantified that cost though.
    // If we were really serious about shrinking this class, we could
    // use 16-bit types here...)
    std::atomic<vluint32_t> m_upstreamDepsDone;
    const vluint32_t m_upstreamDepCount;
 public:
    // CONSTRUCTORS
    // 'upstreamDepCount' is the number of upstream MTaskVertex's
    // that must notify this MTaskVertex before it will become ready
    // to run.
    explicit VlMTaskVertex(vluint32_t upstreamDepCount);
    ~VlMTaskVertex() {}
    // Upstream mtasks must call this when they complete.
    // Returns true when the current MTaskVertex becomes ready to execute,
    // false while it's still waiting on more dependencies.
    inline bool signalUpstreamDone(bool evenCycle) {
        if (evenCycle) {
            // fetch_add returns the previous value; add 1 for the new count
            vluint32_t upstreamDepsDone
                = 1 + m_upstreamDepsDone.fetch_add(1, std::memory_order_release);
            assert(upstreamDepsDone <= m_upstreamDepCount);
            return (upstreamDepsDone == m_upstreamDepCount);
        } else {
            // fetch_sub returns the previous value; 1 means we just hit zero
            vluint32_t upstreamDepsDone_prev
                = m_upstreamDepsDone.fetch_sub(1, std::memory_order_release);
            assert(upstreamDepsDone_prev > 0);
            return (upstreamDepsDone_prev == 1);
        }
    }
    // True when the counter has reached this cycle's target value
    // (m_upstreamDepCount on even cycles, zero on odd cycles).
    inline bool areUpstreamDepsDone(bool evenCycle) const {
        vluint32_t target = evenCycle ? m_upstreamDepCount : 0;
        return m_upstreamDepsDone.load(std::memory_order_acquire) == target;
    }
    // Busy-wait (without yielding) until all upstream dependencies are done.
    inline void waitUntilUpstreamDone(bool evenCycle) const {
        while (VL_UNLIKELY(!areUpstreamDepsDone(evenCycle))) {
            VL_CPU_RELAX();
        }
    }
 };
 // Profiling support
 // One profiling event: either the execution of a single mtask, or a
 // barrier marker. VlThreadPool reads the fields directly (friend).
 class VlProfileRec {
 protected:
    friend class VlThreadPool;
    enum VlProfileE {
        TYPE_MTASK_RUN,
        TYPE_BARRIER
    };
    VlProfileE m_type;  // Record type
    vluint32_t m_mtaskId;  // Mtask we're logging
    vluint32_t m_predictTime;  // How long scheduler predicted would take
    vluint64_t m_startTime;  // Tick at start of execution
    vluint64_t m_endTime;  // Tick at end of execution
    unsigned m_cpu;  // Execution CPU number (at start anyways)
 public:
    class Barrier {};
    // Leaves every field unset; fill in with startRecord()/endRecord().
    VlProfileRec() {}
    // Construct a barrier marker record.
    explicit VlProfileRec(Barrier) {
        m_type = TYPE_BARRIER;
        m_mtaskId = 0;
        m_predictTime = 0;
        m_startTime = 0;
        // Barrier records get copied into every thread's trace, so no field
        // may be left indeterminate.
        m_endTime = 0;
        m_cpu = sched_getcpu();
    }
    // Mark this record as an mtask run beginning at 'time' on the current CPU.
    void startRecord(vluint64_t time, uint32_t mtask, uint32_t predict) {
        m_type = VlProfileRec::TYPE_MTASK_RUN;
        m_mtaskId = mtask;
        m_predictTime = predict;
        m_startTime = time;
        m_cpu = sched_getcpu();
    }
    // Record the tick at which the mtask run finished.
    void endRecord(vluint64_t time) {
        m_endTime = time;
    }
 };
 class VlThreadPool;
 // One dedicated worker thread with its own queue of pending ExecRecs.
 // Other threads hand it work through addTask().
 class VlWorkerThread {
 private:
    // TYPES
    struct ExecRec {
        VlExecFnp m_fnp;  // Function to execute
        VlThrSymTab m_sym;  // Symbol table to execute
        bool m_evenCycle;  // Even/odd for flag alternation
        ExecRec() : m_fnp(NULL), m_sym(NULL), m_evenCycle(false) {}
        ExecRec(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym)
            : m_fnp(fnp), m_sym(sym), m_evenCycle(evenCycle) {}
    };
    // MEMBERS
    VerilatedMutex m_mutex;
    // Why a vector? We expect the pending list to be very short, typically
    // 0 or 1 or 2, so popping from the front shouldn't be
    // expensive. Revisit if we ever have longer queues...
    std::vector<ExecRec> m_ready VL_GUARDED_BY(m_mutex);
    VlThreadPool* m_poolp;  // Our associated thread pool
    // If values stored are non-NULL, the thread is asleep pending new
    // work. If the thread is not asleep, both parts of m_sleepAlarm must
    // be NULL.
    std::pair<VlNotification*, ExecRec*> m_sleepAlarm VL_GUARDED_BY(m_mutex);
    bool m_profiling;  // Is profiling enabled?
    std::atomic<bool> m_exiting;  // Worker thread should exit
    // Declared last so that it is initialized after every field above
    std::thread m_cthread;  // Underlying C++ thread record
    VL_UNCOPYABLE(VlWorkerThread);
 public:
    // CONSTRUCTORS
    explicit VlWorkerThread(VlThreadPool* poolp, bool profiling);
    ~VlWorkerThread();
    // METHODS
    // Move the oldest pending record into *workp. Caller must hold m_mutex
    // and m_ready must be non-empty.
    inline void dequeWork(ExecRec* workp) VL_REQUIRES(m_mutex) {
        // As noted above this is inefficient if our ready list is ever
        // long (but it shouldn't be)
        *workp = m_ready.front();
        m_ready.erase(m_ready.begin());
    }
    // Clear m_sleepAlarm and notify the sleeping thread. Caller must hold
    // m_mutex and sleeping() must be true (notifyp is dereferenced).
    inline void wakeUp() VL_REQUIRES(m_mutex) {
        VlNotification* notifyp = m_sleepAlarm.first;
        m_sleepAlarm.first = NULL;  // NULL+NULL means wake
        m_sleepAlarm.second = NULL;
        notifyp->notify();
    }
    // True when the worker has registered a sleep alarm
    inline bool sleeping() VL_REQUIRES(m_mutex) {
        return (m_sleepAlarm.first != NULL);
    }
    // Queue a task for this worker. If the worker is asleep, the oldest
    // queued record is handed straight to it and it is woken.
    inline void addTask(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym) {
        VerilatedLockGuard lk(m_mutex);
        m_ready.emplace_back(fnp, evenCycle, sym);
        if (VL_LIKELY(sleeping())) {  // Generally queue is waiting for work
            // Awaken thread
            dequeWork(m_sleepAlarm.second);
            wakeUp();
        }
    }
    void workerLoop();
    static void startWorker(VlWorkerThread* workerp);
 };
 // Owns the worker threads and the per-thread profiling buffers.
 class VlThreadPool {
    // TYPES
    typedef std::vector<VlProfileRec> ProfileTrace;
    typedef std::set<ProfileTrace*> ProfileSet;
    // MEMBERS
    std::vector<VlWorkerThread*> m_workers;  // our workers
    bool m_profiling;  // is profiling enabled?
    // Support profiling -- we can append records of profiling events
    // to this vector with very low overhead, and then dump them out
    // later. This prevents the overhead of printf/malloc/IO from
    // corrupting the profiling data. It's super cheap to append
    // a VlProfileRec struct on the end of a pre-allocated vector;
    // this is the only cost we pay in real-time during a profiling cycle.
    static VL_THREAD_LOCAL ProfileTrace* t_profilep;
    ProfileSet m_allProfiles VL_GUARDED_BY(m_mutex);
    VerilatedMutex m_mutex;
 public:
    // CONSTRUCTORS
    // Construct a thread pool with 'nThreads' dedicated threads. The thread
    // pool will create these threads and make them available to execute tasks
    // via this->workerp(index)->addTask(...)
    VlThreadPool(int nThreads, bool profiling);
    ~VlThreadPool();
    // METHODS
    // Number of dedicated worker threads
    inline int numThreads() const {
        return m_workers.size();
    }
    // Worker at 'index'; index must be in [0, numThreads())
    inline VlWorkerThread* workerp(int index) {
        assert(index >= 0);
        assert(index < m_workers.size());
        return m_workers[index];
    }
    // Append an empty record to the calling thread's trace and return a
    // pointer to it. No lock is taken. The pointer refers to a vector
    // element, so it is only valid until that vector next grows.
    // NOTE(review): profileAppendAll() appends to every thread's trace under
    // m_mutex while this does not lock -- presumably callers ensure the two
    // never run concurrently; confirm.
    inline VlProfileRec* profileAppend() {
        t_profilep->emplace_back();
        return &(t_profilep->back());
    }
    void profileAppendAll(const VlProfileRec& rec);
    void profileDump(const char* filenamep, vluint64_t ticksElapsed);
    // In profiling mode, each executing thread must call
    // this once to setup profiling state:
    void setupProfilingClientThread();
    void tearDownProfilingClientThread();
 private:
    VL_UNCOPYABLE(VlThreadPool);
 };
 #endif
--- a/internals.pod
+++ b/internals.pod
@ -155,6 +155,221 @@ provided and documented in C<V3GraphAlg.cpp>.
 =back
 =head2 Multithreaded Mode
 In --threads mode, the frontend of the Verilator pipeline is the same as
 serial mode, up until V3Order.
 V3Order builds a fine-grained, statement-level dependency graph that governs
 the ordering of code within a single eval() call. In serial mode, that
 dependency graph is used to order all statements into a total serial order.
 In parallel mode, the same dependency graph is the starting point for a
 partitioner (V3Partition).
 The partitioner's goal is to coarsen the fine-grained DAG into a coarser
 DAG, while maintaining as much available parallelism as possible. Often the
 partitioner can transform an input graph with millions of nodes into a
 coarsened execution graph with a few dozen nodes, while maintaining enough
 parallelism to take advantage of a modern multicore CPU. Runtime
 synchronization cost is not prohibitive with so few nodes.
 =head3 Partitioning
 Our partitioner is similar to the one Vivek Sarkar described in his 1989
 paper "Partitioning and Scheduling Parallel Programs for Multiprocessors".
 Let's define some terms:
 =over 4
 =item C<Par Factor>
 The available parallelism or "par-factor" of a DAG is the total cost to
 execute all nodes, divided by the cost to execute the longest critical path
 through the graph. This is the speedup you would get from running the graph
 in parallel, given an unlimited number of CPU cores and zero communication
 and synchronization cost.
 =item C<Macro Task>
 When the partitioner coarsens the graph, it combines nodes together. Each
 fine-grained node represents an atomic "task"; combined nodes in the
 coarsened graph are "macro-tasks". This term comes from Sarkar. Each
 macro-task executes from start to end on one processor, without any
 synchronization to any other macro-task during its
 execution. (Synchronization only happens before the macro-task begins or
 after it ends.)
 =item C<Edge Contraction>
 Our partitioner, like Sarkar's, primarily relies on "edge contraction" to
 coarsen the graph. It starts with one macro-task per atomic task and
 iteratively combines pairs of edge-connected macro-tasks.
 =item C<Local Critical Path>
 Each node in the graph has a "local" critical path. That's the critical
 path from the start of the graph to the start of the node, plus the node's
 cost, plus the critical path from the end of the node to the end of the
 graph.
 =back
 Sarkar calls out an important trade-off: coarsening the graph reduces
 runtime synchronization overhead among the macro-tasks, but it tends to
 increase the critical path through the graph and thus reduces par-factor.
 Sarkar's partitioner, and ours, chooses pairs of macro-tasks to merge such
 that the growth in critical path is minimized. Each candidate merge would
 result in a new node, which would have some local critical path. We choose
 the candidate that would produce the shortest local critical path. Repeat
 until par-factor falls to a target threshold. It's a greedy algorithm, and
 it's not guaranteed to produce the best partition (which Sarkar proves is
 NP-hard).
 =head3 Estimating Logic Costs
 To compute the cost of any given path through the graph, Verilator
 estimates an execution cost for each task. Each macro-task has an execution
 cost which is simply the sum of its tasks' costs. We assume that
 communication overhead and synchronization overhead are zero, so the cost
 of any given path through the graph is simply the sum of macro-task
 execution costs. Sarkar does almost the same thing, except that he has
 nonzero estimates for synchronization costs.
 Verilator's cost estimates are assigned by the InstrCountCostVisitor.  This
 class is perhaps the most fragile piece of the multithread implementation.
 It's easy to have a bug where you count something cheap (eg. accessing one
 element of a huge array) as if it were expensive (eg. by counting it as if
 it were an access to the entire array.) Even without such gross bugs, the
 estimates this produces are only loosely predictive of actual runtime cost.
 Multithread performance would be better with better runtime cost
 estimates.  This is an area to improve.
 =head3 Scheduling Macro-Tasks at Runtime
 After coarsening the graph, we must schedule the macro-tasks for runtime.
 Sarkar describes two options: you can dynamically schedule tasks at
 runtime, with a runtime graph follower. Sarkar calls this the
 "macro-dataflow model."  Verilator does not support this; early experiments
 with this approach had poor performance.
 The other option is to statically assign macro-tasks to threads, with each
 thread running its macro-tasks in a static order. Sarkar describes this in
 Chapter 5. Verilator takes this static approach. The only dynamic aspect is
 that each macro task may block before starting, to wait until its
 prerequisites on other threads have finished.
 The synchronization cost is cheap if the prereqs are done. If they're not,
 fragmentation (idle CPU cores waiting) is possible. This is the major
 source of overhead in this approach. The --prof-threads switch and the
 C<verilator_gantt> script can visualize the time lost to such
 fragmentation.
 =head3 Locating Variables for Best Spatial Locality
 After scheduling all code, we attempt to locate variables in memory such
 that variables accessed by a single macro-task are close together in
 memory.  This provides "spatial locality" -- when we pull in a 64-byte
 cache line to access a 2-byte variable, we want the other 62 bytes to be
 ones we'll also likely access soon, for best cache performance.
 This turns out to be critical for performance. It should allow Verilator
 to scale to very large models. We don't rely on our working set fitting
 in any CPU cache; instead we essentially "stream" data into caches from
 memory. It's not literally streaming, where the address increases
 monotonically, but it should have similar performance characteristics,
 so long as each macro-task's dataset fits in one core's local caches.
 To achieve spatial locality, we tag each variable with the set of
 macro-tasks that access it. Let's call this set the "footprint" of that
 variable. The variables in a given module have a set of footprints. We can
 order those footprints to minimize the distance between them (distance is
 the number of macro-tasks that are different across any two footprints) and
 then emit all variables into the struct in ordered-footprint order.
 The footprint ordering is literally the traveling salesman problem, and we
 use a TSP-approximation algorithm to get close to an optimal sort.
 This is an old idea. Simulators designed at DEC in the early 1990s used
 similar techniques to optimize both single-thread and multi-thread modes.
 (Verilator does not optimize variable placement for spatial locality in
 serial mode; that is a possible area for improvement.)
 =head3 Improving Multithreaded Performance Further (a TODO list)
 =over 4
 =item C<Wave Scheduling>
 To allow the verilated model to run in parallel with the testbench, it
 might be nice to support "wave" scheduling, in which work on a cycle begins
 before eval() is called or continues after eval() returns.  For now all
 work on a cycle happens during the eval() call, leaving Verilator's threads
 idle while the testbench (everything outside eval()) is working. This would
 involve fundamental changes within the partitioner; however, it's probably
 the best bet for hiding testbench latency.
 =item C<Efficient Dynamic Scheduling>
 To scale to more than a few threads, we may revisit a fully dynamic
 scheduler. For large (>16 core) systems it might make sense to dedicate an
 entire core to scheduling, so that scheduler data structures would fit in
 its L1 cache and thus the cost of traversing priority-ordered ready lists
 would not be prohibitive.
 =item C<Static Scheduling with Runtime Repack>
 We could modify the static scheduling approach by gathering actual
 macro-task execution times at run time, and dynamically re-packing the
 macro-tasks into the threads also at run time. Say, re-pack once every
 10,000 cycles or something. This has the potential to do better than our
 static estimates about macro-task run times. It could potentially react to
 CPU cores that aren't performing equally, due to NUMA or thermal throttling
 or nonuniform competing memory traffic or whatever.
 =item C<Clock Domain Balancing>
 Right now Verilator makes no attempt to balance clock domains across
 macro-tasks. For a multi-domain model, that could lead to bad gantt chart
 fragmentation. This could be improved if it's a real problem in practice.
 =item C<Other Forms of MTask Balancing>
 The largest source of runtime overhead is idle CPUs, which happens due to
 variance between our predicted runtime for each MTask and its actual
 runtime. That variance is magnified if MTasks are homogeneous, containing
 similar repeating logic which was generally close together in source code
 and which is still packed together even after going through Verilator's
 digestive tract.
 If Verilator could avoid doing that, and instead would take source logic
 that was close together and distribute it across MTasks, that would
 increase the diversity of any given MTask, and this should reduce variance
 in the cost estimates.
 One way to do that might be to make various "tie breaker" comparison
 routines in the sources to rely more heavily on randomness, and generally
 try harder not to keep input nodes together when we have the option to
 scramble things.
 =item C<Performance Regression>
 It would be nice if we had a regression of large designs, with some
 diversity of design styles, to test on both single- and multi-threaded
 modes. This would help to avoid performance regressions, and also to
 evaluate the optimizations while minimizing the impact of parasitic noise.
 =item C<Per-Instance Classes>
 If we have multiple instances of the same module, and they partition
 differently (likely; we make no attempt to partition them the same) then
 the variable sort will be suboptimal for either instance.  A possible
 improvement would be to emit a unique class for each instance of a module,
 and sort its variables optimally for that instance's code stream.
 =back
 =head2 Verilated Flow
 The evaluation loop outputted by Verilator is designed to allow a single
--- a/nodist/install_test
+++ b/nodist/install_test
@ -64,6 +64,7 @@ sub test {
 	run("test -e $prefix/bin/verilator");
 	run("test -e $prefix/bin/verilator_bin");
 	run("test -e $prefix/bin/verilator_bin_dbg");
        run("test -e $prefix/bin/verilator_gantt");
 	run("test -e $prefix/bin/verilator_profcfunc");
    }
--- a/src/Makefile_obj.in
+++ b/src/Makefile_obj.in
@ -217,6 +217,7 @@ RAW_OBJS = \
 	V3Order.o \
 	V3Os.o \
 	V3Param.o \
 	V3Partition.o \
 	V3PreShell.o \
 	V3Premit.o \
 	V3Reloop.o \
--- a/src/V3Ast.h
+++ b/src/V3Ast.h
@ -29,16 +29,24 @@
 #include <vector>
 #include <cmath>
 #include <map>
 #include VL_INCLUDE_UNORDERED_SET
 #include "V3Ast__gen_classes.h"	// From ./astgen
 // Things like:
 //   class V3AstNode;
 // Forward declarations
 class V3Graph;
 class ExecMTask;
 // Hint class so we can choose constructors
 class VFlagLogicPacked {};
 class VFlagBitPacked {};
 class VFlagChildDType {};  // Used by parser.y to select constructor that sets childDType
 // Used as key for another map, needs operator<, hence not an unordered_set
 typedef std::set<int> MTaskIdSet;  // Set of mtaskIds for Var sorting
 //######################################################################
 // For broken() function, return error string if have a match
--- a/src/V3AstNodes.cpp
+++ b/src/V3AstNodes.cpp
@ -31,6 +31,8 @@
 #include "V3Ast.h"
 #include "V3File.h"
 #include "V3Global.h"
 #include "V3Graph.h"
 #include "V3PartitionGraph.h"  // Just for mtask dumping
 //======================================================================
 // Special methods
@ -151,22 +153,26 @@ AstNodeBiop* AstEqWild::newTyped(FileLine* fl, AstNode* lhsp, AstNode* rhsp) {
    }
 }
 // Build the node together with the freshly allocated V3Graph it owns.
 AstExecGraph::AstExecGraph(FileLine* fileline)
    : AstNode(fileline)
    , m_depGraphp(new V3Graph) {}
 // Free the graph allocated by the constructor.
 AstExecGraph::~AstExecGraph() {
    delete m_depGraphp;
    VL_DANGLING(m_depGraphp);
 }
 // True when the variable is flagged public, or when the all-public option
 // is set and the variable is neither a temporary nor a genvar.
 bool AstVar::isSigPublic() const {
    return (m_sigPublic || (v3Global.opt.allPublic() && !isTemp() && !isGenVar()));
 }
 // True for a SystemC variable for which isQuad() holds and which is not
 // classified as ScBv or ScBigUint.
 bool AstVar::isScQuad() const {
    return (isSc() && isQuad() && !isScBv() && !isScBigUint());
 }
 // True for a SystemC variable at least pinsBv() bits wide, or for any
 // variable carrying the m_attrScBv attribute.
 bool AstVar::isScBv() const {
    return ((isSc() && width() >= v3Global.opt.pinsBv()) || m_attrScBv);
 }
 // True for a SystemC variable of width 2..64 when the pinsScUint option is
 // set, unless it already qualifies as ScBv.
 bool AstVar::isScUint() const {
    return ((isSc() && v3Global.opt.pinsScUint() && width() >= 2 && width() <= 64) && !isScBv());
 }
 // True for a SystemC variable of width 65..512 when the pinsScBigUint option
 // is set, unless it already qualifies as ScBv.
 bool AstVar::isScBigUint() const {
    return ((isSc() && v3Global.opt.pinsScBigUint() && width() >= 65 && width() <= 512) && !isScBv());
 }
@ -441,6 +447,16 @@ AstVar* AstVar::scVarRecurse(AstNode* nodep) {
    return NULL;
 }
 string AstVar::mtasksString() const {
    // Render the mtask footprint as " all: " followed by every mtask ID
    // in set order, each ID terminated by a single space.
    std::ostringstream out;
    out<<" all: ";
    MTaskIdSet::const_iterator idIt = m_mtaskIds.begin();
    const MTaskIdSet::const_iterator idEnd = m_mtaskIds.end();
    while (idIt != idEnd) {
        out<<*idIt<<" ";
        ++idIt;
    }
    return out.str();
 }
 AstNodeDType* AstNodeDType::dtypeDimensionp(int dimension) {
    // dimension passed from AstArraySel::dimension
    // Dimension 0 means the VAR itself, 1 is the closest SEL to the AstVar,
@ -970,6 +986,11 @@ void AstSliceSel::dump(std::ostream& str) {
        str<<" decl"<<declRange();
    }
 }
 void AstMTaskBody::dump(std::ostream& str) {
    // Print the generic node dump followed by the attached ExecMTask's
    // own description.
    this->AstNode::dump(str);
    str<<" ";
    // m_execMTaskp is NULL from construction until execMTaskp() attaches
    // the mtask, so a debug dump taken in that window must not
    // dereference it.
    if (m_execMTaskp) {
        m_execMTaskp->dump(str);
    } else {
        str<<"(no execMTask)";
    }
 }
 void AstTypeTable::dump(std::ostream& str) {
    this->AstNode::dump(str);
    for (int i=0; i<(int)(AstBasicDTypeKwd::_ENUM_MAX); ++i) {
--- a/src/V3AstNodes.h
+++ b/src/V3AstNodes.h
@ -1124,6 +1124,7 @@ private:
    bool	m_noSubst:1;	// Do not substitute out references
    bool	m_trace:1;	// Trace this variable
    AstVarAttrClocker m_attrClocker;
    MTaskIdSet  m_mtaskIds;  // MTaskID's that read or write this var
    void	init() {
 	m_input=false; m_output=false; m_tristate=false; m_declOutput=false;
@ -1323,6 +1324,10 @@ public:
 	if (varType()==AstVarType::INPUT || varType()==AstVarType::OUTPUT) m_varType = AstVarType::WIRE;
    }
    static AstVar* scVarRecurse(AstNode* nodep);
    void addProducingMTaskId(int id) { m_mtaskIds.insert(id); }
    void addConsumingMTaskId(int id) { m_mtaskIds.insert(id); }
    const MTaskIdSet& mtaskIds() const { return m_mtaskIds; }
    string mtasksString() const;
 };
 class AstDefParam : public AstNode {
@ -5698,6 +5703,44 @@ public:
    AstNode* bodysp() const { return op1p(); }  // op1= expressions to print
 };
 class AstMTaskBody : public AstNode {
    // Hold statements for each MTask
    // The statements are kept as the op1 child list; the owning ExecMTask
    // is referenced through a plain pointer that starts out NULL and must
    // be attached with execMTaskp() before the tree is checked by broken().
 private:
    ExecMTask* m_execMTaskp;  // MTask this body belongs to; NULL until attached
 public:
    explicit AstMTaskBody(FileLine* flp)
        : AstNode(flp)
        , m_execMTaskp(NULL) {}
    ASTNODE_NODE_FUNCS(MTaskBody);
    // Reports the node as broken while no ExecMTask has been attached
    virtual const char* broken() const { BROKEN_RTN(!m_execMTaskp); return NULL; }
    AstNode* stmtsp() const { return op1p(); }  // op1= statements of this mtask
    void addStmtsp(AstNode* nodep) { addOp1p(nodep); }
    ExecMTask* execMTaskp() const { return m_execMTaskp; }
    void execMTaskp(ExecMTask* execMTaskp) { m_execMTaskp = execMTaskp; }
    virtual void dump(std::ostream& str=std::cout);
 };
 class AstExecGraph : public AstNode {
    // For parallel execution, this node contains a dependency graph.  Each
    // node in the graph is an ExecMTask, which contains a body for the
    // mtask, which contains a set of AstActive's, each of which calls a
    // leaf AstCFunc. whew!
    //
    // The mtask bodies are also children of this node, so we can visit
    // them without traversing the graph (it's not always needed to
    // traverse the graph.)
    //
    // NOTE(review): the destructor is declared here and the graph pointer
    // is a raw owning pointer -- confirm that cloning this node cannot
    // leave two nodes referring to the same V3Graph.
 private:
    V3Graph *m_depGraphp;  // contains ExecMTask's
 public:
    explicit AstExecGraph(FileLine* fileline);
    ASTNODE_NODE_FUNCS_NO_DTOR(ExecGraph)
    virtual ~AstExecGraph();
    // Reports the node as broken if it has no dependency graph
    virtual const char* broken() const { BROKEN_RTN(!m_depGraphp); return NULL; }
    // Read-only view of the dependency graph
    const V3Graph* depGraphp() const { return m_depGraphp; }
    // Writable access to the same graph
    V3Graph* mutableDepGraphp() { return m_depGraphp; }
    // Append an mtask body to this node's op1 child list
    void addMTaskBody(AstMTaskBody* bodyp) { addOp1p(bodyp); }
 };
 class AstSplitPlaceholder : public AstNode {
 public:
    // Dummy node used within V3Split; never exists outside of V3Split.
@ -5749,12 +5792,14 @@ private:
    AstTypeTable* m_typeTablep;	// Reference to top type table, for faster lookup
    AstPackage*	  m_dollarUnitPkgp;
    AstCFunc*     m_evalp;      // The '_eval' function
    AstExecGraph* m_execGraphp;  // Execution MTask graph for threads>1 mode
 public:
    AstNetlist()
 	: AstNode(new FileLine("AstRoot",0))
 	, m_typeTablep(NULL)
 	, m_dollarUnitPkgp(NULL)
-	, m_evalp(NULL) { }
+        , m_evalp(NULL)
        , m_execGraphp(NULL) { }
    ASTNODE_NODE_FUNCS(Netlist)
    virtual const char* broken() const {
        BROKEN_RTN(m_dollarUnitPkgp && !m_dollarUnitPkgp->brokeExists());
@ -5784,6 +5829,8 @@ public:
 	return m_dollarUnitPkgp; }
    AstCFunc* evalp() const { return m_evalp; }
    void evalp(AstCFunc* evalp) { m_evalp = evalp; }
    AstExecGraph* execGraphp() const { return m_execGraphp; }
    void execGraphp(AstExecGraph* graphp) { m_execGraphp = graphp; }
 };
 //######################################################################
--- a/src/V3Clock.cpp
+++ b/src/V3Clock.cpp
@ -68,6 +68,7 @@ private:
    AstCFunc*		m_settleFuncp;	// Top settlement function we are creating
    AstSenTree*		m_lastSenp;	// Last sensitivity match, so we can detect duplicates.
    AstIf*		m_lastIfp;	// Last sensitivity if active to add more under
    AstMTaskBody*       m_mtaskBodyp;   // Current mtask body
    // METHODS
    VL_DEBUG_FUNC;  // Declare debug()
@ -338,6 +339,30 @@ private:
 	    // Only empty blocks should be leftover on the non-top.  Killem.
 	    if (nodep->stmtsp()) nodep->v3fatalSrc("Non-empty lower active");
 	    nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
        } else if (m_mtaskBodyp) {
            UINFO(4,"  TR ACTIVE  "<<nodep<<endl);
            AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
            if (nodep->hasClocked()) {
                if (nodep->hasInitial()) nodep->v3fatalSrc("Initial block should not have clock sensitivity");
                if (m_lastSenp && nodep->sensesp()->sameTree(m_lastSenp)) {
                    UINFO(4,"    sameSenseTree\n");
                } else {
                    clearLastSen();
                    m_lastSenp = nodep->sensesp();
                    // Make a new if statement
                    m_lastIfp = makeActiveIf(m_lastSenp);
                    m_mtaskBodyp->addStmtsp(m_lastIfp);
                }
                // Move statements to if
                m_lastIfp->addIfsp(stmtsp);
            } else if (nodep->hasInitial() || nodep->hasSettle()) {
                nodep->v3fatalSrc("MTask should not include initial/settle logic.");
            } else {
                // Combo logic. Move statements to mtask func.
                clearLastSen();
                m_mtaskBodyp->addStmtsp(stmtsp);
            }
            nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
 	} else {
 	    UINFO(4,"  ACTIVE  "<<nodep<<endl);
 	    AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
@ -372,6 +397,20 @@ private:
 	    nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
 	}
    }
    virtual void visit(AstExecGraph* nodep) {
        // Visit every AstMTaskBody under the graph, then relocate the
        // graph node into the eval loop.
        //
        // m_mtaskBodyp serves as the loop cursor and, at the same time, as
        // the "current mtask" that the AstActive visitor tests while the
        // body is being iterated.  The loop exits with m_mtaskBodyp NULL.
        for (m_mtaskBodyp = VN_CAST(nodep->op1p(), MTaskBody);
             m_mtaskBodyp;
             m_mtaskBodyp = VN_CAST(m_mtaskBodyp->nextp(), MTaskBody)) {
            // Forget the previous sensitivity match before each body
            clearLastSen();
            iterate(m_mtaskBodyp);
        }
        clearLastSen();
        // Move the ExecGraph into _eval. Its location marks the
        // spot where the graph will execute, relative to other
        // (serial) logic in the cycle.
        nodep->unlinkFrBack();
        addToEvalLoop(nodep);
    }
    //--------------------
    // Default: Just iterate
@ -391,6 +430,7 @@ public:
        m_lastSenp = NULL;
 	m_lastIfp = NULL;
 	m_scopep = NULL;
        m_mtaskBodyp = NULL;
 	//
        iterate(nodep);
        // Allow downstream modules to find _eval()
--- a/src/V3EmitC.cpp
+++ b/src/V3EmitC.cpp
@ -34,6 +34,8 @@
 #include "V3EmitC.h"
 #include "V3EmitCBase.h"
 #include "V3Number.h"
 #include "V3PartitionGraph.h"
 #include "V3TSP.h"
 #define VL_VALUE_STRING_MAX_WIDTH 8192	// We use a static char array in VL_VALUE_STRING
@ -103,7 +105,13 @@ public:
 	    puts("["+cvtToStr(arrayp->elementsConst())+"]");
 	}
    }
-
+    void emitVarCmtChg(const AstVar* varp, string* curVarCmtp) {
        // Emit a "// Begin mtask footprint ..." comment whenever varp's
        // mtasksString() differs from the one last emitted.
        // *curVarCmtp holds the last emitted footprint string and is
        // updated in place when a new comment is written.
        string newVarCmt = varp->mtasksString();
        if (*curVarCmtp != newVarCmt) {
            *curVarCmtp = newVarCmt;
            puts("// Begin mtask footprint "+*curVarCmtp+"\n");
        }
    }
    void emitTypedefs(AstNode* firstp) {
 	bool first = true;
 	for (AstNode* loopp=firstp; loopp; loopp = loopp->nextp()) {
@ -783,6 +791,50 @@ public:
    virtual ~EmitCStmts() {}
 };
 //######################################################################
 // Establish mtask variable sort order in mtasks mode
 class EmitVarTspSorter : public V3TSP::TspStateBase {
 private:
    // MEMBERS
    const MTaskIdSet& m_mtaskIds;  // Mtask we're ordering
    static unsigned m_serialNext;  // Unique ID to establish serial order
    unsigned m_serial;  // Serial ordering
 public:
    // CONSTRUCTORS
    explicit EmitVarTspSorter(const MTaskIdSet& mtaskIds)
        : m_mtaskIds(mtaskIds),
          m_serial(++m_serialNext) {}
    virtual ~EmitVarTspSorter() {}
    // METHODS
    bool operator<(const TspStateBase& other) const {
        return operator<(dynamic_cast<const EmitVarTspSorter&>(other));
    }
    bool operator<(const EmitVarTspSorter& other) const {
        return m_serial < other.m_serial;
    }
    const MTaskIdSet& mtaskIds() const { return m_mtaskIds; }
    virtual int cost(const TspStateBase* otherp) const {
        return cost(dynamic_cast<const EmitVarTspSorter*>(otherp));
    }
    virtual int cost(const EmitVarTspSorter* otherp) const {
        int cost = diffs(m_mtaskIds, otherp->m_mtaskIds);
        cost += diffs(otherp->m_mtaskIds, m_mtaskIds);
        return cost;
    }
    // Returns the number of elements in set_a that don't appear in set_b
    static int diffs(const MTaskIdSet& set_a, const MTaskIdSet& set_b) {
        int diffs = 0;
        for (MTaskIdSet::iterator it = set_a.begin();
             it != set_a.end(); ++it) {
            if (set_b.find(*it) == set_b.end()) ++diffs;
        }
        return diffs;
    }
 };
 unsigned EmitVarTspSorter::m_serialNext = 0;
 //######################################################################
 // Internal EmitC implementation
@ -873,6 +925,91 @@ class EmitCImp : EmitCStmts {
 	return ofp;
    }
    // Returns the number of cross-thread dependencies into mtaskp.
    // If >0, mtaskp must test whether its prereqs are done before starting,
    // and may need to block.
    static uint32_t packedMTaskMayBlock(const ExecMTask* mtaskp) {
        uint32_t result = 0;
        for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep; edgep = edgep->inNextp()) {
            const ExecMTask* prevp = dynamic_cast<ExecMTask*>(edgep->fromp());
            if (prevp->thread() != mtaskp->thread()) {
                ++result;
            }
        }
        return result;
    }
    void emitMTaskBody(AstMTaskBody* nodep) {
        // Emit the generated code for one mtask, followed (recursively) by
        // the code for each mtask returned by packNextp(), so the whole
        // chain lands in the function currently being written.
        //
        // Emission order per mtask: optional wait on cross-thread
        // prerequisites, optional profiling start record, mtaskId update,
        // the body statements, optional profiling end record, message
        // queue flush, then a signal to every downstream mtask that is on
        // a different thread.
        ExecMTask* curExecMTaskp = nodep->execMTaskp();
        // Only mtasks with at least one cross-thread inbound edge wait
        if (packedMTaskMayBlock(curExecMTaskp)) {
            puts("vlTOPp->__Vm_mt_" + cvtToStr(curExecMTaskp->id())
                 + ".waitUntilUpstreamDone(even_cycle);\n");
        }
        // Name of the generated local holding the profile record pointer;
        // left empty when thread profiling is off
        string recName;
        if (v3Global.opt.profThreads()) {
            recName = "__Vprfthr_" + cvtToStr(curExecMTaskp->id());
            puts("VlProfileRec* " + recName + " = NULL;\n");
            // Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
            puts("if (VL_UNLIKELY(vlTOPp->__Vm_profile_cycle_start)) {\n");
            puts(  recName + " = vlTOPp->__Vm_threadPoolp->profileAppend();\n");
            puts(  recName + "->startRecord(VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start,");
            puts(               " "+cvtToStr(curExecMTaskp->id())+ ",");
            puts(               " "+cvtToStr(curExecMTaskp->cost())+");\n");
            puts("}\n");
        }
        puts("Verilated::mtaskId(" + cvtToStr(curExecMTaskp->id()) + ");\n");
        // The actual body of calls to leaf functions
        iterateAndNextNull(nodep->stmtsp());
        if (v3Global.opt.profThreads()) {
            // Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
            puts("if (VL_UNLIKELY("+recName+")) {\n");
            puts(  recName + "->endRecord(VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start);\n");
            puts("}\n");
        }
        // Flush message queue
        puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");
        // For any downstream mtask that's on another thread, bump its
        // counter and maybe notify it.
        for (V3GraphEdge* edgep = curExecMTaskp->outBeginp();
             edgep; edgep = edgep->outNextp()) {
            const ExecMTask* nextp = dynamic_cast<ExecMTask*>(edgep->top());
            if (nextp->thread() != curExecMTaskp->thread()) {
                puts("vlTOPp->__Vm_mt_"+cvtToStr(nextp->id())
                     + ".signalUpstreamDone(even_cycle);\n");
            }
        }
        // Run the next mtask inline
        const ExecMTask* nextp = curExecMTaskp->packNextp();
        if (nextp) {
            emitMTaskBody(nextp->bodyp());
        } else {
            // Unblock the fake "final" mtask
            puts("vlTOPp->__Vm_mt_final.signalUpstreamDone(even_cycle);\n");
        }
    }
    virtual void visit(AstMTaskBody* nodep) {
        // Emit the definition of the function named after this mtask:
        //   void <ModClass>::<cFuncName>(bool even_cycle, void* symtab) {...}
        // The function body is produced by emitMTaskBody(), which also
        // appends the mtasks chained after this one via packNextp().
        ExecMTask* mtp = nodep->execMTaskp();
        puts("\n");
        puts("void ");
        puts(modClassName(m_modp)+"::"+mtp->cFuncName());
        puts("(bool even_cycle, void* symtab) {\n");
        // Declare and set vlSymsp
        // (the untyped symtab argument is cast back to the symbol class)
        puts(EmitCBaseVisitor::symClassVar() + " = ("
             + EmitCBaseVisitor::symClassName() + "*)symtab;\n");
        puts(EmitCBaseVisitor::symTopAssign()+"\n");
        emitMTaskBody(nodep);
        puts("}\n");
    }
    //---------------------------------------
    // VISITORS
    using EmitCStmts::visit;  // Suppress hidden overloaded virtual function warning
@ -973,6 +1110,54 @@ class EmitCImp : EmitCStmts {
 	emitVarReset(varp);
    }
    virtual void visit(AstExecGraph* nodep) {
        // Emit, at the ExecGraph's position inside the containing _eval()
        // function, the code that launches the mtask graph and then waits
        // for it to complete.
        //
        // The children are intentionally not iterated here: function
        // definitions for the mtasks are emitted elsewhere, at the end.
        if (v3Global.rootp()->execGraphp() != nodep) {
            nodep->v3fatalSrc("ExecGraph should be a singleton!");
        }
        puts("vlTOPp->__Vm_even_cycle = !vlTOPp->__Vm_even_cycle;\n");
        // Collect the thread-root mtasks, in graph vertex order
        std::vector<const ExecMTask*> rootMTasks;
        const V3GraphVertex* vtxp = nodep->depGraphp()->verticesBeginp();
        while (vtxp) {
            const ExecMTask* mtaskp = dynamic_cast<const ExecMTask*>(vtxp);
            if (mtaskp->threadRoot()) rootMTasks.push_back(mtaskp);
            vtxp = vtxp->verticesNextp();
        }
        if (rootMTasks.size() >
            static_cast<unsigned>(v3Global.opt.threads())) {
            nodep->v3fatalSrc("More root mtasks than available threads");
        }
        // Nothing to launch, and nothing to wait for
        if (rootMTasks.empty()) return;
        // All roots but the last are handed to thread pool workers
        const uint32_t inlineIdx = static_cast<uint32_t>(rootMTasks.size() - 1);
        for (uint32_t i = 0; i < inlineIdx; ++i) {
            puts("vlTOPp->__Vm_threadPoolp->workerp("
                 + cvtToStr(i)+")->addTask("
                 + rootMTasks[i]->cFuncName()
                 + ", vlTOPp->__Vm_even_cycle, vlSymsp);\n");
        }
        // The thread calling eval() runs the last root inline, along with
        // its packed successors.
        puts(rootMTasks[inlineIdx]->cFuncName()
             + "(vlTOPp->__Vm_even_cycle, vlSymsp);\n");
        puts("Verilated::mtaskId(0);\n");
        puts("vlTOPp->__Vm_mt_final.waitUntilUpstreamDone(vlTOPp->__Vm_even_cycle);\n");
    }
    //---------------------------------------
    // ACCESSORS
@ -995,6 +1180,8 @@ class EmitCImp : EmitCStmts {
    void emitStaticDecl(AstNodeModule* modp);
    void emitSettleLoop(const std::string& eval_call, bool initial);
    void emitWrapEval(AstNodeModule* modp);
    void emitMTaskState();
    void emitMTaskVertexCtors(bool* firstp);
    void emitInt(AstNodeModule* modp);
    void maybeSplit(AstNodeModule* modp);
@ -1534,6 +1721,36 @@ void EmitCImp::emitCoverageDecl(AstNodeModule* modp) {
    }
 }
 void EmitCImp::emitMTaskVertexCtors(bool* firstp) {
    // Emit the constructor initializer-list entries for the multithreaded
    // state members: one __Vm_mt_<id> per mtask that has cross-thread
    // prerequisites, then __Vm_mt_final, __Vm_threadPoolp, the optional
    // profiling counter and __Vm_even_cycle.
    // firstp is passed through to emitCtorSep() before every entry.
    AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
    if (!execGraphp) v3Global.rootp()->v3fatalSrc("Should have an execGraphp");
    const V3Graph* depGraphp = execGraphp->depGraphp();
    unsigned finalEdgesInCt = 0;
    for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
         vxp; vxp = vxp->verticesNextp()) {
        const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
        // Number of cross-thread inbound edges; computed once and used
        // both as the emit condition and as the vertex's initial count.
        const unsigned edgesInCt = packedMTaskMayBlock(mtp);
        if (edgesInCt > 0) {
            emitCtorSep(firstp);
            puts("__Vm_mt_"+cvtToStr(mtp->id())+"("+cvtToStr(edgesInCt)+")");
        }
        // Each mtask with no packed successor will become a dependency
        // for the final node:
        if (!mtp->packNextp()) ++finalEdgesInCt;
    }
    emitCtorSep(firstp);
    puts("__Vm_mt_final(" + cvtToStr(finalEdgesInCt) + ")");
    emitCtorSep(firstp); puts("__Vm_threadPoolp(NULL)");
    if (v3Global.opt.profThreads()) {
        emitCtorSep(firstp); puts("__Vm_profile_cycle_start(0)");
    }
    // This will flip to 'true' before the start of the 0th cycle.
    emitCtorSep(firstp); puts("__Vm_even_cycle(false)");
 }
 void EmitCImp::emitCtorImp(AstNodeModule* modp) {
    puts("\n");
    bool first = true;
@ -1544,6 +1761,9 @@ void EmitCImp::emitCtorImp(AstNodeModule* modp) {
        first = false;  // VL_CTOR_IMP includes the first ':'
    }
    emitVarCtors(&first);
    if (modp->isTop() && v3Global.opt.mtasks()) {
        emitMTaskVertexCtors(&first);
    }
    puts(" {\n");
    emitCellCtors(modp);
    emitSensitives();
@ -1556,6 +1776,39 @@ void EmitCImp::emitCtorImp(AstNodeModule* modp) {
    putsDecoration("// Reset structure values\n");
    puts("_ctor_var_reset();\n");
    emitTextSection(AstType::atScCtor);
    if (modp->isTop() && v3Global.opt.mtasks()) {
        // TODO-- For now each top module creates its own ThreadPool here,
        // and deletes it in the destructor. If A and B are each top level
        // modules, each creates a separate thread pool.  This allows
        // A.eval() and B.eval() to run concurrently without any
        // interference -- so long as the physical machine has enough cores
        // to support both pools and all testbench threads.
        //
        // In the future, we might want to let the client provide a
        // threadpool to the constructor. This would allow two or more
        // models to share a single threadpool.
        //
        // For example: suppose models A and B are each compiled to run on
        // 4 threads. The client might create a single thread pool with 3
        // threads and pass it to both models. If the client can ensure that
        // A.eval() and B.eval() do NOT run concurrently, there will be no
        // contention for the threads. This mode is missing for now.  (Is
        // there demand for such a setup?)
        puts("__Vm_threadPoolp = new VlThreadPool("
             // Note we create N-1 threads in the thread pool. The thread
             // that calls eval() becomes the final Nth thread for the
             // duration of the eval call.
             + cvtToStr(v3Global.opt.threads() - 1)
             + ", " + cvtToStr(v3Global.opt.profThreads())
             + ");\n");
        if (v3Global.opt.profThreads()) {
            puts("__Vm_profile_cycle_start = 0;\n");
            puts("__Vm_profile_time_finished = 0;\n");
            puts("__Vm_profile_window_ct = 0;");
        }
    }
    puts("}\n");
 }
@ -1597,6 +1850,9 @@ void EmitCImp::emitCoverageImp(AstNodeModule* modp) {
 void EmitCImp::emitDestructorImp(AstNodeModule* modp) {
    puts("\n");
    puts(modClassName(modp)+"::~"+modClassName(modp)+"() {\n");
    if (modp->isTop() && v3Global.opt.mtasks()) {
        puts("delete __Vm_threadPoolp; __Vm_threadPoolp = NULL;\n");
    }
    emitTextSection(AstType::atScDtor);
    if (modp->isTop()) puts("delete __VlSymsp; __VlSymsp=NULL;\n");
    puts("}\n");
@ -1796,9 +2052,47 @@ void EmitCImp::emitWrapEval(AstNodeModule* modp) {
    if (v3Global.opt.threads() == 1) {
 	uint32_t mtaskId = 0;
 	putsDecoration("// MTask "+cvtToStr(mtaskId)+" start\n");
-	puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask starting, mtaskId="+cvtToStr(mtaskId)+"\\n\"););\n");
+        puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask"+cvtToStr(mtaskId)+" starting\\n\"););\n");
 	puts("Verilated::mtaskId("+cvtToStr(mtaskId)+");\n");
    }
    if (v3Global.opt.mtasks()
        && v3Global.opt.profThreads()) {
        puts("if (VL_UNLIKELY((Verilated::profThreadsStart() != __Vm_profile_time_finished)\n");
        puts(                 " && (VL_TIME_Q() > Verilated::profThreadsStart())\n");
        puts(                 " && (Verilated::profThreadsWindow() >= 1))) {\n");
        // Within a profile (either starting, middle, or end)
        puts(    "if (vlTOPp->__Vm_profile_window_ct == 0) {\n");  // Opening file?
        // Start profile on this cycle. We'll capture a window worth, then
        // only analyze the next window worth. The idea is that the first window
        // capture will hit some cache-cold stuff (eg printf) but it'll be warm
        // by the time we hit the second window, we hope.
        puts(        "vlTOPp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
        // "* 2" as first half is warmup, second half is collection
        puts(        "vlTOPp->__Vm_profile_window_ct = Verilated::profThreadsWindow() * 2 + 1;\n");
        puts(    "}\n");
        puts(    "--vlTOPp->__Vm_profile_window_ct;\n");
        puts(    "if (vlTOPp->__Vm_profile_window_ct == (Verilated::profThreadsWindow())) {\n");
        // This barrier record in every threads' profile demarcates the
        // cache-warm-up cycles before the barrier from the actual profile
        // cycles afterward.
        puts(        "vlTOPp->__Vm_threadPoolp->profileAppendAll(");
        puts(                       "VlProfileRec(VlProfileRec::Barrier()));\n");
        puts(        "vlTOPp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
        puts(    "}\n");
        puts(    "else if (vlTOPp->__Vm_profile_window_ct == 0) {\n");
        // Ending file.
        puts(        "vluint64_t elapsed = VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start;\n");
        puts(        "vlTOPp->__Vm_threadPoolp->profileDump(Verilated::profThreadsFilenamep(), elapsed);\n");
        // This turns off the test to enter the profiling code, but still
        // allows the user to collect another profile by changing
        // profThreadsStart
        puts(        "__Vm_profile_time_finished = Verilated::profThreadsStart();\n");
        puts(        "vlTOPp->__Vm_profile_cycle_start = 0;\n");
        puts(    "}\n");
        puts("}\n");
    }
    emitSettleLoop(
        (string("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Clock loop\\n\"););\n")
         + (v3Global.opt.trace() ? "vlSymsp->__Vm_activity = true;\n" : "")
@ -1832,10 +2126,13 @@ void EmitCStmts::emitVarList(AstNode* firstp, EisWhich which, const string& pref
    // Put out a list of signal declarations
    // in order of 0:clocks, 1:vluint8, 2:vluint16, 4:vluint32, 5:vluint64, 6:wide, 7:arrays
    // This aids cache packing and locality
    // Largest->smallest reduces the number of pad variables.
    // But for now, Smallest->largest makes it more likely a small offset will allow access to the signal.
    // TODO: Move this sort to an earlier visitor stage.
    //
    // Largest->smallest reduces the number of pad variables.  Also
    // experimented with alternating between large->small and small->large
    // on successive Mtask groups, but then when a new mtask gets added may
    // cause a huge delta.
    //
    // TODO: Move this sort to an earlier visitor stage.
    VarSortMap varAnonMap;
    VarSortMap varNonanonMap;
@ -1891,8 +2188,9 @@ void EmitCStmts::emitVarList(AstNode* firstp, EisWhich which, const string& pref
 void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
    UASSERT(sortedp->empty(), "Sorted should be initially empty");
-    {
+    if (!v3Global.opt.mtasks()) {
-        // Plain old serial mode. Sort by size, from small to large.
+        // Plain old serial mode. Sort by size, from small to large,
        // to optimize for both packing and small offsets in code.
        for (VarSortMap::const_iterator it = vmap.begin();
             it != vmap.end(); ++it) {
            for (VarVec::const_iterator jt = it->second.begin();
@ -1900,12 +2198,52 @@ void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
                sortedp->push_back(*jt);
            }
        }
        return;
    }
    // MacroTask mode.  Sort by MTask-affinity group first, size second.
    typedef std::map<MTaskIdSet, VarSortMap> MTaskVarSortMap;
    MTaskVarSortMap m2v;
    for (VarSortMap::const_iterator it = vmap.begin(); it != vmap.end(); ++it) {
        int size_class = it->first;
        const VarVec& vec = it->second;
        for (VarVec::const_iterator jt = vec.begin(); jt != vec.end(); ++jt) {
            const AstVar* varp = *jt;
            m2v[varp->mtaskIds()][size_class].push_back(varp);
        }
    }
    // Create a TSP sort state for each MTaskIdSet footprint
    V3TSP::StateVec states;
    for (MTaskVarSortMap::iterator it = m2v.begin(); it != m2v.end(); ++it) {
        states.push_back(new EmitVarTspSorter(it->first));
    }
    // Do the TSP sort
    V3TSP::StateVec sorted_states;
    V3TSP::tspSort(states, &sorted_states);
    for (V3TSP::StateVec::iterator it = sorted_states.begin();
         it != sorted_states.end(); ++it) {
        const EmitVarTspSorter* statep = dynamic_cast<const EmitVarTspSorter*>(*it);
        const VarSortMap& localVmap = m2v[statep->mtaskIds()];
        // use rbegin/rend to sort size large->small
        for (VarSortMap::const_reverse_iterator jt = localVmap.rbegin();
             jt != localVmap.rend(); ++jt) {
            const VarVec& vec = jt->second;
            for (VarVec::const_iterator kt = vec.begin();
                 kt != vec.end(); ++kt) {
                sortedp->push_back(*kt);
            }
        }
        delete statep; VL_DANGLING(statep);
    }
 }
 void EmitCStmts::emitSortedVarList(const VarVec& anons,
                                   const VarVec& nonanons,
                                   const string& prefixIfImp) {
    string curVarCmt = "";
    // Output anons
    {
        int anonMembers = anons.size();
@ -1933,6 +2271,7 @@ void EmitCStmts::emitSortedVarList(const VarVec& anons,
                    if (anonL1s != 1) puts("struct {\n");
                    for (int l0=0; l0<lim && it != anons.end(); ++l0) {
                        const AstVar* varp = *it;
                        emitVarCmtChg(varp, &curVarCmt);
                        emitVarDecl(varp, prefixIfImp);
                        ++it;
                    }
@ -1945,12 +2284,14 @@ void EmitCStmts::emitSortedVarList(const VarVec& anons,
        // Leftovers, just in case off by one error somewhere above
        for (; it != anons.end(); ++it) {
            const AstVar* varp = *it;
            emitVarCmtChg(varp, &curVarCmt);
            emitVarDecl(varp, prefixIfImp);
        }
    }
    // Output nonanons
    for (VarVec::const_iterator it = nonanons.begin(); it != nonanons.end(); ++it) {
        const AstVar* varp = *it;
        emitVarCmtChg(varp, &curVarCmt);
        emitVarDecl(varp, prefixIfImp);
    }
 }
@ -1986,6 +2327,59 @@ void EmitCImp::emitIntFuncDecls(AstNodeModule* modp) {
 	    if (funcp->ifdef()!="") puts("#endif // "+funcp->ifdef()+"\n");
 	}
    }
    if (modp->isTop() && v3Global.opt.mtasks()) {
        // Emit the mtask func prototypes.
        AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
        if (!execGraphp) v3Global.rootp()->v3fatalSrc("Root should have an execGraphp");
        const V3Graph* depGraphp = execGraphp->depGraphp();
        for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
             vxp; vxp = vxp->verticesNextp()) {
            const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
            if (mtp->threadRoot()) {
                // Emit function declaration for this mtask
                ofp()->putsPrivate(true);
                puts("static void "); puts(mtp->cFuncName());
                puts("(bool even_cycle, void* symtab);\n");
            }
        }
        // No AstCFunc for this one, as it's synthetic. Just write it:
        puts("static void __Vmtask__final(bool even_cycle, void* symtab);\n");
    }
 }
 void EmitCImp::emitMTaskState() {
    ofp()->putsPrivate(true);
    AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
    if (!execGraphp) v3Global.rootp()->v3fatalSrc("Root should have an execGraphp");
    const V3Graph* depGraphp = execGraphp->depGraphp();
    for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
         vxp; vxp = vxp->verticesNextp()) {
        const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
        if (packedMTaskMayBlock(mtp) > 0) {
            puts("VlMTaskVertex __Vm_mt_" + cvtToStr(mtp->id()) + ";\n");
        }
    }
    // This fake mtask depends on all the real ones.  We use it to block
    // eval() until all mtasks are done.
    //
    // In the future we might allow _eval() to return before the graph is
    // fully done executing, for "half wave" scheduling. For now we wait
    // for all mtasks though.
    puts("VlMTaskVertex __Vm_mt_final;\n");
    puts("VlThreadPool* __Vm_threadPoolp;\n");
    if (v3Global.opt.profThreads()) {
        // rdtsc() at current cycle start
        puts("vluint64_t __Vm_profile_cycle_start;\n");
        // Time we finished analysis
        puts("vluint64_t __Vm_profile_time_finished;\n");
        // Track our position in the cache warmup and actual profile window
        puts("vluint32_t __Vm_profile_window_ct;\n");
    }
    puts("bool __Vm_even_cycle;\n");
 }
 void EmitCImp::emitInt(AstNodeModule* modp) {
@ -2000,6 +2394,9 @@ void EmitCImp::emitInt(AstNodeModule* modp) {
    } else {
 	puts("#include \"verilated.h\"\n");
    }
    if (v3Global.opt.mtasks()) {
        puts("#include \"verilated_threads.h\"\n");
    }
    if (v3Global.opt.savable()) {
 	puts("#include \"verilated_save.h\"\n");
    }
@ -2084,6 +2481,9 @@ void EmitCImp::emitInt(AstNodeModule* modp) {
 	    puts("bool __Vm_inhibitSim;  ///< Set true to disable evaluation of module\n");
 	}
    }
    if (modp->isTop() && v3Global.opt.mtasks()) {
        emitMTaskState();
    }
    emitCoverageDecl(modp);	// may flip public/private
    puts("\n// PARAMETERS\n");
@ -2291,6 +2691,24 @@ void EmitCImp::main(AstNodeModule* modp, bool slow, bool fast) {
 	}
    }
    if (fast && modp->isTop() && v3Global.opt.mtasks()) {
        // Make a final pass and emit function definitions for the mtasks
        // in the ExecGraph
        AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
        const V3Graph* depGraphp = execGraphp->depGraphp();
        for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
             vxp; vxp = vxp->verticesNextp()) {
            const ExecMTask* mtaskp = dynamic_cast<const ExecMTask*>(vxp);
            if (mtaskp->threadRoot()) {
                maybeSplit(modp);
                // Only define one function for all the mtasks packed on
                // a given thread. We'll name this function after the
                // root mtask though it contains multiple mtasks' worth
                // of logic.
                iterate(mtaskp->bodyp());
            }
        }
    }
    delete m_ofp; m_ofp=NULL;
 }
--- a/src/V3EmitMk.cpp
+++ b/src/V3EmitMk.cpp
@ -94,6 +94,9 @@ public:
 			    putMakeClassEntry(of, "verilated_vcd_sc.cpp");
 			}
 		    }
                    if (v3Global.opt.mtasks()) {
                        putMakeClassEntry(of, "verilated_threads.cpp");
                    }
 		}
 		else if (support==2 && slow) {
 		}
--- a/src/V3Error.h
+++ b/src/V3Error.h
@ -131,7 +131,7 @@ public:
 	    "ALWCOMBORDER", "ASSIGNDLY", "ASSIGNIN",
 	    "BLKANDNBLK", "BLKLOOPINIT", "BLKSEQ", "BSSPACE",
 	    "CASEINCOMPLETE", "CASEOVERLAP", "CASEWITHX", "CASEX", "CDCRSTLOGIC", "CLKDATA",
-	    "CMPCONST", "COLONPLUS", "COMBDLY", "DEFPARAM", "DECLFILENAME",
+            "CMPCONST", "COLONPLUS", "COMBDLY", "DEFPARAM", "DECLFILENAME",
 	    "ENDLABEL", "GENCLK",
 	    "IFDEPTH", "IMPERFECTSCH", "IMPLICIT", "IMPURE",
            "INCABSPATH", "INFINITELOOP", "INITIALDLY",
--- a/src/V3LifePost.cpp
+++ b/src/V3LifePost.cpp
@ -37,6 +37,8 @@
 #include VL_INCLUDE_UNORDERED_MAP
 #include "V3Global.h"
 #include "V3PartitionGraph.h"
 #include "V3GraphPathChecker.h"
 #include "V3LifePost.h"
 #include "V3Stats.h"
 #include "V3Ast.h"
@ -78,6 +80,11 @@ private:
            iterate(nodep->funcp());
        }
    }
    virtual void visit(AstExecGraph* nodep) {
        // Visit everything under the ExecGraph node.
        // Can just iterate across the MTask bodies in any order.  Order
        // isn't important for LifePostElimVisitor's simple substitution.
        iterateChildren(nodep);
    }
    virtual void visit(AstCFunc* nodep) {
        if (!m_tracingCall && !nodep->entryPoint()) return;
        m_tracingCall = false;
@ -101,11 +108,17 @@ public:
 // and a sequence number within the mtask:
 struct LifeLocation {
    const ExecMTask* mtaskp;
    uint32_t sequence;
 public:
-    LifeLocation() : sequence(0) {}
+    LifeLocation() : mtaskp(NULL), sequence(0) {}
-    LifeLocation(uint32_t sequence_) : sequence(sequence_) {}
+    LifeLocation(const ExecMTask* mtaskp_, uint32_t sequence_)
        : mtaskp(mtaskp_), sequence(sequence_) {}
    bool operator< (const LifeLocation& b) const {
        // Order by mtask id first (a NULL mtaskp counts as id 0), and only
        // when both ids match fall back to the sequence number.
        const unsigned lhsId = mtaskp ? mtaskp->id() : 0;
        const unsigned rhsId = b.mtaskp ? b.mtaskp->id() : 0;
        if (lhsId != rhsId) return lhsId < rhsId;
        return sequence < b.sequence;
    }
 };
@ -130,6 +143,9 @@ private:
    // STATE
    uint32_t            m_sequence;     // Sequence number of assigns/varrefs,
    //                                  // local to the current MTask.
    const ExecMTask*    m_execMTaskp;   // Current ExecMTask being processed,
    //                                  // or NULL for serial code.
    V3Double0           m_statAssnDel;  // Statistic tracking
    bool                m_tracingCall;  // Currently tracing a CCall to a CFunc
@ -143,11 +159,15 @@ private:
    typedef vl_unordered_map<const AstVarScope*, LifePostLocation> PostLocMap;
    PostLocMap          m_assignposts;  // AssignPost dly var locations
    const V3Graph*      m_mtasksGraphp;  // Mtask tracking graph
    vl_unique_ptr<GraphPathChecker> m_checker;
    // METHODS
    VL_DEBUG_FUNC;  // Declare debug()
-    static bool before(const LifeLocation& a, const LifeLocation& b) {
+    bool before(const LifeLocation& a, const LifeLocation& b) {
-        return a.sequence < b.sequence;
+        if (a.mtaskp == b.mtaskp) return a.sequence < b.sequence;
        return m_checker->pathExistsFrom(a.mtaskp, b.mtaskp);
    }
    bool outsideCriticalArea(LifeLocation loc,
                             const std::set<LifeLocation>& dlyVarAssigns,
@ -159,6 +179,13 @@ private:
        // Otherwise, loc could fall in the "critical" area where the
        // substitution affects the result of the operation at loc, so
        // return false.
        if (!loc.mtaskp && assignPostLoc.mtaskp) {
            // This is threaded mode; 'loc' is something that happens at
            // initial/settle time, or perhaps in _eval() but outside of
            // the mtask graph.
            // In either case, it's not in the critical area.
            return true;
        }
        if (before(assignPostLoc, loc)) return true;
        for (std::set<LifeLocation>::iterator it = dlyVarAssigns.begin();
             it != dlyVarAssigns.end(); ++it) {
@ -239,6 +266,17 @@ private:
        // within the mtask) where each varscope is read, and written.
        iterateChildren(nodep);
        if (v3Global.opt.mtasks()) {
            if (!m_mtasksGraphp) {
                nodep->v3fatalSrc("Should have initted m_mtasksGraphp by now");
            }
            m_checker.reset(new GraphPathChecker(m_mtasksGraphp));
        } else {
            if (m_mtasksGraphp) {
                nodep->v3fatalSrc("Did not expect any m_mtasksGraphp in serial mode");
            }
        }
        // Find all assignposts. Determine which ones can be
        // eliminated. Remove those, and mark their dly vars' user4 field
        // to indicate we should replace these dly vars with their original
@ -252,7 +290,8 @@ private:
        // Consumption/generation of a variable,
        AstVarScope* vscp = nodep->varScopep();
        if (!vscp) nodep->v3fatalSrc("Scope not assigned");
-        LifeLocation loc(++m_sequence);
+
        LifeLocation loc(m_execMTaskp, ++m_sequence);
        if (nodep->lvalue()) {
            m_writes[vscp].insert(loc);
        } else {
@ -275,7 +314,7 @@ private:
            if (m_assignposts.find(dlyVarp) != m_assignposts.end()) {
                nodep->v3fatalSrc("LifePostLocation attempted duplicate dlyvar map addition");
            }
-            LifeLocation loc(++m_sequence);
+            LifeLocation loc(m_execMTaskp, ++m_sequence);
            m_assignposts[dlyVarp] = LifePostLocation(loc, nodep);
        }
    }
@ -291,6 +330,18 @@ private:
            iterate(nodep->funcp());
        }
    }
    virtual void visit(AstExecGraph* nodep) {
        // Handle the ExecGraph as if it called every mtask body in turn.
        // The dependency graph is remembered in m_mtasksGraphp.
        m_mtasksGraphp = nodep->depGraphp();
        V3GraphVertex* vertexp = m_mtasksGraphp->verticesBeginp();
        while (vertexp) {
            ExecMTask* curMTaskp = dynamic_cast<ExecMTask*>(vertexp);
            // Locations recorded while visiting this body carry this mtask,
            // and the sequence count restarts for each mtask.
            m_execMTaskp = curMTaskp;
            m_sequence = 0;
            iterate(curMTaskp->bodyp());
            vertexp = vertexp->verticesNextp();
        }
        // Back to "no current mtask" once all bodies are done
        m_execMTaskp = NULL;
    }
    virtual void visit(AstCFunc* nodep) {
        if (!m_tracingCall && !nodep->entryPoint()) return;
        m_tracingCall = false;
@ -305,7 +356,9 @@ public:
    // CONSTRUCTORS
    explicit LifePostDlyVisitor(AstNetlist* nodep)
        : m_sequence(0)
-        , m_tracingCall(false) {
+        , m_execMTaskp(NULL)
        , m_tracingCall(false)
        , m_mtasksGraphp(NULL) {
        iterate(nodep);
    }
    virtual ~LifePostDlyVisitor() {
--- a/src/V3Options.cpp
+++ b/src/V3Options.cpp
@ -661,6 +661,9 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
 	    else if ( !strcmp (sw, "-debug-abort") )		{ abort(); } // Undocumented, see also --debug-sigsegv
 	    else if ( onoff   (sw, "-debug-check", flag/*ref*/) ){ m_debugCheck = flag; }
            else if ( onoff   (sw, "-debug-leak", flag/*ref*/) ){ m_debugLeak = flag; }
            else if ( onoff   (sw, "-debug-nondeterminism", flag/*ref*/) ){ m_debugNondeterminism = flag; }
            else if ( onoff   (sw, "-debug-partition", flag/*ref*/) ){ m_debugPartition = flag; }  // Undocumented
            else if ( onoff   (sw, "-debug-self-test", flag/*ref*/) ){ m_debugSelfTest = flag; }  // Undocumented
 	    else if ( !strcmp (sw, "-debug-sigsegv") )		{ throwSigsegv(); }  // Undocumented, see also --debug-abort
 	    else if ( !strcmp (sw, "-debug-fatalsrc") )		{ v3fatalSrc("--debug-fatal-src"); }  // Undocumented, see also --debug-abort
 	    else if ( onoff   (sw, "-decoration", flag/*ref*/) ) { m_decoration = flag; }
@ -678,6 +681,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
 	    else if ( !strcmp (sw, "-private") )		{ m_public = false; }
            else if ( onoff   (sw, "-prof-cfuncs", flag/*ref*/) )       { m_profCFuncs = flag; }
            else if ( onoff   (sw, "-profile-cfuncs", flag/*ref*/) )    { m_profCFuncs = flag; }  // Undocumented, for backward compat
            else if ( onoff   (sw, "-prof-threads", flag/*ref*/) )      { m_profThreads = flag; }
 	    else if ( onoff   (sw, "-public", flag/*ref*/) )		{ m_public = flag; }
            else if ( !strncmp(sw, "-pvalue+", strlen("-pvalue+")))	{ addParameter(string(sw+strlen("-pvalue+")), false); }
            else if ( onoff   (sw, "-relative-cfuncs", flag/*ref*/) )   { m_relativeCFuncs = flag; }
@ -689,6 +693,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
 	    else if ( onoff   (sw, "-stats", flag/*ref*/) )		{ m_stats = flag; }
 	    else if ( onoff   (sw, "-stats-vars", flag/*ref*/) )	{ m_statsVars = flag; m_stats |= flag; }
 	    else if ( !strcmp (sw, "-sv") )				{ m_defaultLanguage = V3LangCode::L1800_2005; }
            else if ( onoff   (sw, "-threads-coarsen", flag/*ref*/))    { m_threadsCoarsen = flag; }  // Undocumented, debug
 	    else if ( onoff   (sw, "-trace", flag/*ref*/) )		{ m_trace = flag; }
 	    else if ( onoff   (sw, "-trace-dups", flag/*ref*/) )	{ m_traceDups = flag; }
 	    else if ( onoff   (sw, "-trace-params", flag/*ref*/) )	{ m_traceParams = flag; }
@ -1013,6 +1018,20 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
 		shift; m_threads = atoi(argv[i]);
 		if (m_threads < 0) fl->v3fatal("--threads must be >= 0: "<<argv[i]);
 	    }
            else if ( !strcmp (sw, "-threads-dpi") && (i+1)<argc) {
                shift;
                if (!strcmp(argv[i], "all")) { m_threadsDpiPure=true; m_threadsDpiUnpure=true; }
                else if (!strcmp(argv[i], "none")) { m_threadsDpiPure=false; m_threadsDpiUnpure=false; }
                else if (!strcmp(argv[i], "pure")) { m_threadsDpiPure=true; m_threadsDpiUnpure=false; }
                else {
                    fl->v3fatal("Unknown setting for --threads-dpi: "<<argv[i]);
                }
            }
            else if ( !strcmp (sw, "-threads-max-mtasks") && (i+1)<argc ) {
                // --threads-max-mtasks <n>: the value is taken from the
                // argument following the switch.  Only match when that
                // argument exists, as the sibling -threads-dpi and
                // -top-module branches do; otherwise argv[i] below could be
                // the terminating NULL entry and atoi() would dereference it.
                shift; m_threadsMaxMTasks = atoi(argv[i]);
                // Values below 1 are rejected, including the 0 that atoi()
                // returns for non-numeric text.
                if (m_threadsMaxMTasks < 1)
                    fl->v3fatal("--threads-max-mtasks must be >= 1: "<<argv[i]);
            }
 	    else if ( !strcmp (sw, "-top-module") && (i+1)<argc ) {
 		shift; m_topModule = argv[i];
 	    }
@ -1223,6 +1242,9 @@ V3Options::V3Options() {
    m_coverageUser = false;
    m_debugCheck = false;
    m_debugLeak = true;
    m_debugNondeterminism = false;
    m_debugPartition = false;
    m_debugSelfTest = false;
    m_decoration = true;
    m_exe = false;
    m_ignc = false;
@ -1237,6 +1259,7 @@ V3Options::V3Options() {
    m_pinsScBigUint = false;
    m_pinsUint8 = false;
    m_profCFuncs = false;
    m_profThreads = false;
    m_preprocOnly = false;
    m_preprocNoLine = false;
    m_public = false;
@ -1249,6 +1272,10 @@ V3Options::V3Options() {
    m_statsVars = false;
    m_systemC = false;
    m_threads = 0;
    m_threadsDpiPure = true;
    m_threadsDpiUnpure = false;
    m_threadsCoarsen = true;
    m_threadsMaxMTasks = 0;
    m_trace = false;
    m_traceDups = false;
    m_traceParams = true;
--- a/src/V3Options.h
+++ b/src/V3Options.h
@ -75,7 +75,10 @@ class V3Options {
    bool	m_coverageUnderscore;// main switch: --coverage-underscore
    bool	m_coverageUser;	// main switch: --coverage-func
    bool	m_debugCheck;	// main switch: --debug-check
-    bool        m_debugLeak;   // main switch: --debug-leak
+    bool        m_debugLeak;    // main switch: --debug-leak
    bool        m_debugNondeterminism;  // main switch: --debug-nondeterminism
    bool        m_debugPartition;  // main switch: --debug-partition
    bool        m_debugSelfTest;  // main switch: --debug-self-test
    bool	m_decoration;	// main switch: --decoration
    bool	m_exe;		// main switch: --exe
    bool	m_ignc;		// main switch: --ignc
@ -87,6 +90,7 @@ class V3Options {
    bool	m_pinsScBigUint;// main switch: --pins-sc-biguint
    bool	m_pinsUint8;	// main switch: --pins-uint8
    bool        m_profCFuncs;   // main switch: --prof-cfuncs
    bool        m_profThreads;  // main switch: --prof-threads
    bool	m_public;	// main switch: --public
    bool	m_relativeCFuncs; // main switch: --relative-cfuncs
    bool	m_relativeIncludes; // main switch: --relative-includes
@ -96,6 +100,9 @@ class V3Options {
    bool	m_skipIdentical;// main switch: --skip-identical
    bool	m_stats;	// main switch: --stats
    bool	m_statsVars;	// main switch: --stats-vars
    bool        m_threadsCoarsen;  // main switch: --threads-coarsen
    bool        m_threadsDpiPure;  // main switch: --threads-dpi all/pure
    bool        m_threadsDpiUnpure;  // main switch: --threads-dpi all
    bool	m_trace;	// main switch: --trace
    bool	m_traceDups;	// main switch: --trace-dups
    bool	m_traceParams;	// main switch: --trace-params
@ -117,6 +124,7 @@ class V3Options {
    int		m_outputSplitCTrace;// main switch: --output-split-ctrace
    int		m_pinsBv;	// main switch: --pins-bv
    int		m_threads;	// main switch: --threads (0 == --no-threads)
    int         m_threadsMaxMTasks;  // main switch: --threads-max-mtasks
    int		m_traceDepth;	// main switch: --trace-depth
    int		m_traceMaxArray;// main switch: --trace-max-array
    int		m_traceMaxWidth;// main switch: --trace-max-width
@ -232,8 +240,14 @@ class V3Options {
    bool coverageUser() const { return m_coverageUser; }
    bool debugCheck() const { return m_debugCheck; }
    bool debugLeak() const { return m_debugLeak; }
    bool debugNondeterminism() const { return m_debugNondeterminism; }
    bool debugPartition() const { return m_debugPartition; }
    bool debugSelfTest() const { return m_debugSelfTest; }
    bool decoration() const { return m_decoration; }
    bool exe() const { return m_exe; }
    bool threadsDpiPure() const { return m_threadsDpiPure; }
    bool threadsDpiUnpure() const { return m_threadsDpiUnpure; }
    bool threadsCoarsen() const { return m_threadsCoarsen; }
    bool trace() const { return m_trace; }
    bool traceDups() const { return m_traceDups; }
    bool traceParams() const { return m_traceParams; }
@ -246,6 +260,7 @@ class V3Options {
    bool pinsScBigUint() const { return m_pinsScBigUint; }
    bool pinsUint8() const { return m_pinsUint8; }
    bool profCFuncs() const { return m_profCFuncs; }
    bool profThreads() const { return m_profThreads; }
    bool allPublic() const { return m_public; }
    bool lintOnly() const { return m_lintOnly; }
    bool ignc() const { return m_ignc; }
@ -267,6 +282,7 @@ class V3Options {
    int	   outputSplitCTrace() const { return m_outputSplitCTrace; }
    int	   pinsBv() const { return m_pinsBv; }
    int threads() const { return m_threads; }
    int threadsMaxMTasks() const { return m_threadsMaxMTasks; }
    bool mtasks() const { return (m_threads > 1); }
    int	   traceDepth() const { return m_traceDepth; }
    int	   traceMaxArray() const { return m_traceMaxArray; }
--- a/src/V3Order.cpp
+++ b/src/V3Order.cpp
@ -89,19 +89,22 @@
 #include <sstream>
 #include <memory>
 #include "V3Global.h"
 #include "V3File.h"
 #include "V3Ast.h"
 #include "V3Const.h"
 #include "V3EmitCBase.h"
 #include "V3EmitV.h"
 #include "V3File.h"
 #include "V3Global.h"
 #include "V3Graph.h"
 #include "V3GraphStream.h"
 #include "V3List.h"
 #include "V3Partition.h"
 #include "V3PartitionGraph.h"
 #include "V3SenTree.h"
 #include "V3Stats.h"
 #include "V3EmitCBase.h"
 #include "V3Const.h"
 #include "V3Order.h"
 #include "V3OrderGraph.h"
 #include "V3EmitV.h"
 #include VL_INCLUDE_UNORDERED_MAP
 #include VL_INCLUDE_UNORDERED_SET
@ -423,10 +426,15 @@ class ProcessMoveBuildGraph {
    // OrderVisitor. It produces a slightly coarsened graph to drive the
    // code scheduling.
    //
-    // * The new graph contains nodes of type OrderMoveVertex.
+    // * For the serial code scheduler, the new graph contains
    //   nodes of type OrderMoveVertex.
    //
    // * For the threaded code scheduler, the new graph contains
    //   nodes of type MTaskMoveVertex.
    //
    // * The difference in output type is abstracted away by the
-    //   'T_MoveVertex' template parameter.
+    //   'T_MoveVertex' template parameter; ProcessMoveBuildGraph otherwise
    //   works the same way for both cases.
    // TYPES
    typedef std::pair<const V3GraphVertex*, const AstSenTree*> VxDomPair;
@ -563,7 +571,7 @@ private:
 };
 //######################################################################
-// OrderMoveVertexMaker
+// OrderMoveVertexMaker and related
 class OrderMoveVertexMaker
    : public ProcessMoveBuildGraph<OrderMoveVertex>::MoveVertexMaker {
@ -595,6 +603,64 @@ private:
    VL_UNCOPYABLE(OrderMoveVertexMaker);
 };
 class OrderMTaskMoveVertexMaker
    : public ProcessMoveBuildGraph<MTaskMoveVertex>::MoveVertexMaker {
    // Vertex factory handed to ProcessMoveBuildGraph<MTaskMoveVertex>;
    // creates and frees MTaskMoveVertex's in a single target graph.
    V3Graph* m_pomGraphp;  // Graph that new vertices are created in
 public:
    explicit OrderMTaskMoveVertexMaker(V3Graph* pomGraphp)
        : m_pomGraphp(pomGraphp) {}
    // Returns a new vertex in the target graph, or NULL when the domain is
    // an initial or settle domain.
    MTaskMoveVertex* makeVertexp(OrderLogicVertex* lvertexp,
                                 const OrderEitherVertex* varVertexp,
                                 const AstScope* scopep,
                                 const AstSenTree* domainp) {
        if (!domainp->hasInitial() && !domainp->hasSettle()) {
            return new MTaskMoveVertex(m_pomGraphp, lvertexp, varVertexp, scopep, domainp);
        }
        // Initial/settle logic stays out of the mtasks graph;
        // time-zero logic is output separately.
        return NULL;
    }
    // Unlinks the given vertex from the target graph and deletes it.
    void freeVertexp(MTaskMoveVertex* freeMep) {
        freeMep->unlinkDelete(m_pomGraphp);
    }
 private:
    VL_UNCOPYABLE(OrderMTaskMoveVertexMaker);
 };
 class OrderVerticesByDomainThenScope {
    // Less-than functor over graph vertices, which must be
    // MTaskMoveVertex's.  Vertices are ordered by the id that m_ids assigns
    // to their domain, then by the id it assigns to their scope.
    PartPtrIdMap m_ids;  // Supplies the ids (via findId) that are compared
 public:
    // The class has a virtual member, so give it a virtual destructor as
    // the sibling comparator MTaskVxIdLessThan does.
    virtual ~OrderVerticesByDomainThenScope() {}
    // Returns true if lhsp sorts strictly before rhsp.
    virtual bool operator()(const V3GraphVertex* lhsp,
                            const V3GraphVertex* rhsp) const {
        // NOTE(review): the cast results are dereferenced without a NULL
        // check; presumably callers only pass MTaskMoveVertex's - confirm.
        const MTaskMoveVertex* l_vxp = dynamic_cast<const MTaskMoveVertex*>(lhsp);
        const MTaskMoveVertex* r_vxp = dynamic_cast<const MTaskMoveVertex*>(rhsp);
        // Primary key: domain
        vluint64_t l_id = m_ids.findId(l_vxp->domainp());
        vluint64_t r_id = m_ids.findId(r_vxp->domainp());
        if (l_id < r_id) return true;
        if (l_id > r_id) return false;
        // Same domain; secondary key: scope
        l_id = m_ids.findId(l_vxp->scopep());
        r_id = m_ids.findId(r_vxp->scopep());
        return l_id < r_id;
    }
 };
 class MTaskVxIdLessThan {
 public:
    MTaskVxIdLessThan() {}
    virtual ~MTaskVxIdLessThan() {}
    // Sort vertex's, which must be AbstractMTask's, into a deterministic
    // order by comparing their serial IDs.
    virtual bool operator()(const V3GraphVertex* lhsp,
                            const V3GraphVertex* rhsp) const {
        const AbstractMTask* lmtaskp =
            dynamic_cast<const AbstractLogicMTask*>(lhsp);
        const AbstractMTask* rmtaskp =
            dynamic_cast<const AbstractLogicMTask*>(rhsp);
        return lmtaskp->id() < rmtaskp->id();
    }
 };
 //######################################################################
 // Order class functions
@ -701,6 +767,7 @@ private:
    void processDomainsIterate(OrderEitherVertex* vertexp);
    void processEdgeReport();
    // processMove* routines schedule serial execution
    void processMove();
    void processMoveClear();
    void processMoveBuildGraph();
@ -711,6 +778,18 @@ private:
    AstActive* processMoveOneLogic(const OrderLogicVertex* lvertexp,
                                   AstCFunc*& newFuncpr, int& newStmtsr);
    // processMTask* routines schedule threaded execution
    // Per-mtask bookkeeping record; both pointers start out NULL.
    struct MTaskState {
        typedef std::list<const OrderLogicVertex*> Logics;
        AstMTaskBody* m_mtaskBodyp;  // AST body node for this mtask
        Logics m_logics;  // Logic vertices belonging to this mtask, in list order
        ExecMTask* m_execMTaskp;  // ExecMTask graph vertex for this mtask
        MTaskState() : m_mtaskBodyp(NULL), m_execMTaskp(NULL) {}
    };
    void processMTasks();
    typedef enum {LOGIC_INITIAL, LOGIC_SETTLE} InitialLogicE;
    void processMTasksInitial(InitialLogicE logic_type);
    string cfuncName(AstNodeModule* modp, AstSenTree* domainp, AstScope* scopep, AstNode* forWhatp) {
 	modp->user3Inc();
 	int funcnum = modp->user3();
@ -1726,6 +1805,173 @@ AstActive* OrderVisitor::processMoveOneLogic(const OrderLogicVertex* lvertexp,
    return activep;
 }
 void OrderVisitor::processMTasksInitial(InitialLogicE logic_type) {
    // Emits the logic of one time-zero kind: initial logic for
    // LOGIC_INITIAL, settle logic for LOGIC_SETTLE.  This logic is kept
    // out of the mtask partition, so it isn't eligible for parallelism.
    //
    AstScope* prevScopep = NULL;
    AstCFunc* curFuncp = NULL;
    int stmtCount = 0;
    for (V3GraphVertex* vxp = m_graph.verticesBeginp();
         vxp; vxp = vxp->verticesNextp()) {
        OrderLogicVertex* logicp = dynamic_cast<OrderLogicVertex*>(vxp);
        // Only logic vertices are of interest
        if (!logicp) continue;
        // Skip logic whose domain is not of the requested kind
        const bool notInitial = (logic_type == LOGIC_INITIAL)
            && !logicp->domainp()->hasInitial();
        if (notInitial) continue;
        const bool notSettle = (logic_type == LOGIC_SETTLE)
            && !logicp->domainp()->hasSettle();
        if (notSettle) continue;
        if (logicp->scopep() != prevScopep) {
            // A cfunc must not cross scopes, so force a fresh one
            curFuncp = NULL;
            prevScopep = logicp->scopep();
        }
        if (AstActive* activep
            = processMoveOneLogic(logicp, curFuncp/*ref*/, stmtCount/*ref*/)) {
            m_scopetopp->addActivep(activep);
        }
    }
 }
 void OrderVisitor::processMTasks() {
    // Threaded-mode scheduling.  In order, this:
    //  1. emits initial and settle logic via processMTasksInitial(),
    //  2. builds 'logicGraph' from m_graph and partitions it into mtasks,
    //  3. collects each mtask's logic into an ordered per-mtask list,
    //  4. creates the AstExecGraph, plus an AstMTaskBody and an ExecMTask
    //     for each mtask, and mirrors the mtask dependency edges into the
    //     AstExecGraph's dependency graph.
    //
    // For nondeterminism debug:
    V3Partition::hashGraphDebug(&m_graph, "V3Order's m_graph");
    processMTasksInitial(LOGIC_INITIAL);
    processMTasksInitial(LOGIC_SETTLE);
    // We already produced a graph of every var, input, logic, and settle
    // block and all dependencies; this is 'm_graph'.
    //
    // Now, starting from m_graph, make a slightly-coarsened graph representing
    // only logic, and discarding edges we know we can ignore.
    // This is quite similar to the 'm_pomGraph' of the serial code gen:
    V3Graph logicGraph;
    OrderMTaskMoveVertexMaker create_mtask_vertex(&logicGraph);
    ProcessMoveBuildGraph<MTaskMoveVertex> mtask_pmbg(
        &m_graph, &logicGraph, &create_mtask_vertex);
    mtask_pmbg.build();
    // Needed? We do this for m_pomGraph in serial mode, so do it here too:
    logicGraph.removeRedundantEdges(&V3GraphEdge::followAlwaysTrue);
    // Partition logicGraph into LogicMTask's. The partitioner will annotate
    // each vertex in logicGraph with a 'color' which is really an mtask ID
    // in this context.
    V3Partition partitioner(&logicGraph);
    V3Graph mtasks;
    partitioner.go(&mtasks);
    // Keyed by mtask id; entries are created on first access via operator[]
    vl_unordered_map<unsigned /*mtask id*/, MTaskState> mtaskStates;
    // Iterate through the entire logicGraph. For each logic node,
    // attach it to a per-MTask ordered list of logic nodes.
    // This is the order we'll execute logic nodes within the MTask.
    //
    // MTasks may span scopes and domains, so sort by both here:
    GraphStream<OrderVerticesByDomainThenScope> emit_logic(&logicGraph);
    const V3GraphVertex* moveVxp;
    while ((moveVxp = emit_logic.nextp())) {
        // NOTE(review): cast result is used without a NULL check; presumably
        // logicGraph holds only MTaskMoveVertex's - confirm.
        const MTaskMoveVertex* movep =
            dynamic_cast<const MTaskMoveVertex*>(moveVxp);
        unsigned mtaskId = movep->color();
        UASSERT(mtaskId > 0,
                "Every MTaskMoveVertex should have an mtask assignment >0");
        if (movep->logicp()) {
            // Add this logic to the per-mtask order
            mtaskStates[mtaskId].m_logics.push_back(movep->logicp());
            // Since we happen to be iterating over every logic node,
            // take this opportunity to annotate each AstVar with the id's
            // of mtasks that consume it and produce it. We'll use this
            // information in V3EmitC when we lay out var's in memory.
            const OrderLogicVertex* logicp = movep->logicp();
            // Vars on the 'from' side of logicp's incoming edges
            // NOTE(review): confirm m_graph's edge direction convention
            // matches the producing/consuming labels used in these two loops.
            for (const V3GraphEdge* edgep = logicp->inBeginp();
                 edgep; edgep = edgep->inNextp()) {
                const OrderVarVertex* pre_varp =
                    dynamic_cast<const OrderVarVertex*>(edgep->fromp());
                if (!pre_varp) continue;
                AstVar* varp = pre_varp->varScp()->varp();
                // varp depends on logicp, so logicp produces varp,
                // and vice-versa below
                varp->addProducingMTaskId(mtaskId);
            }
            // Vars on the 'to' side of logicp's outgoing edges
            for (const V3GraphEdge* edgep = logicp->outBeginp();
                 edgep; edgep = edgep->outNextp()) {
                const OrderVarVertex* post_varp
                    = dynamic_cast<const OrderVarVertex*>(edgep->top());
                if (!post_varp) continue;
                AstVar* varp = post_varp->varScp()->varp();
                varp->addConsumingMTaskId(mtaskId);
            }
            // TODO? We ignore IO vars here, so those will have empty mtask
            // signatures. But we could also give those mtask signatures.
        }
    }
    // Create the AstExecGraph node which represents the execution
    // of the MTask graph.
    FileLine* rootFlp = new FileLine("AstRoot", 0);
    AstExecGraph* execGraphp = new AstExecGraph(rootFlp);
    m_scopetopp->addActivep(execGraphp);
    v3Global.rootp()->execGraphp(execGraphp);
    // Create CFuncs and bodies for each MTask.
    GraphStream<MTaskVxIdLessThan> emit_mtasks(&mtasks);
    const V3GraphVertex* mtaskVxp;
    while ((mtaskVxp = emit_mtasks.nextp())) {
        const AbstractLogicMTask* mtaskp =
            dynamic_cast<const AbstractLogicMTask*>(mtaskVxp);
        // Create a body for this mtask
        AstMTaskBody* bodyp = new AstMTaskBody(rootFlp);
        MTaskState& state = mtaskStates[mtaskp->id()];
        state.m_mtaskBodyp = bodyp;
        // Create leaf CFunc's to run this mtask's logic,
        // and create a set of AstActive's to call those CFuncs.
        // Add the AstActive's into the AstMTaskBody.
        const AstSenTree* last_domainp = NULL;
        AstCFunc* leafCFuncp = NULL;
        int leafStmts = 0;
        for (MTaskState::Logics::iterator it = state.m_logics.begin();
             it != state.m_logics.end(); ++it) {
            const OrderLogicVertex* logicp = *it;
            if (logicp->domainp() != last_domainp) {
                // Start a new leaf function.
                leafCFuncp = NULL;
            }
            last_domainp = logicp->domainp();
            AstActive* newActivep = processMoveOneLogic(logicp, leafCFuncp/*ref*/, leafStmts/*ref*/);
            if (newActivep) bodyp->addStmtsp(newActivep);
        }
        // Translate the LogicMTask graph into the corresponding ExecMTask
        // graph, which will outlive V3Order and persist for the remainder
        // of verilator's processing.
        // - The LogicMTask graph points to MTaskMoveVertex's
        //   and OrderLogicVertex's which are ephemeral to V3Order.
        // - The ExecMTask graph and the AstMTaskBody's produced here
        //   persist until code generation time.
        state.m_execMTaskp =
            new ExecMTask(execGraphp->mutableDepGraphp(),
                          bodyp, mtaskp->id());
        // Cross-link each ExecMTask and MTaskBody
        //  Q: Why even have two objects?
        //  A: One is an AstNode, the other is a GraphVertex,
        //     to combine them would involve multiple inheritance...
        state.m_mtaskBodyp->execMTaskp(state.m_execMTaskp);
        // Mirror each incoming dependency edge of this mtask into the
        // ExecMTask graph.
        // NOTE(review): this relies on fromState.m_execMTaskp already being
        // set, i.e. on emit_mtasks yielding predecessors before successors;
        // presumably GraphStream guarantees that - confirm.
        for (V3GraphEdge* inp = mtaskp->inBeginp();
             inp; inp = inp->inNextp()) {
            const V3GraphVertex* fromVxp = inp->fromp();
            const AbstractLogicMTask* fromp =
                dynamic_cast<const AbstractLogicMTask*>(fromVxp);
            MTaskState& fromState = mtaskStates[fromp->id()];
            new V3GraphEdge(execGraphp->mutableDepGraphp(),
                            fromState.m_execMTaskp, state.m_execMTaskp, 1);
        }
        execGraphp->addMTaskBody(bodyp);
    }
 }
 //######################################################################
 // OrderVisitor - Top processing
@ -1762,7 +2008,7 @@ void OrderVisitor::process() {
    if (debug() && v3Global.opt.dumpTree()) processEdgeReport();
-    {
+    if (!v3Global.opt.mtasks()) {
        UINFO(2,"  Construct Move Graph...\n");
        processMoveBuildGraph();
        if (debug()>=4) m_pomGraph.dumpDotFilePrefixed("ordermv_start");  // Different prefix (ordermv) as it's not the same graph
@ -1771,6 +2017,9 @@ void OrderVisitor::process() {
        UINFO(2,"  Move...\n");
        processMove();
    } else {
        UINFO(2,"  Set up mtasks...\n");
        processMTasks();
    }
    // Any SC inputs feeding a combo domain must be marked, so we can make them sc_sensitive
--- a/src/V3OrderGraph.h
+++ b/src/V3OrderGraph.h
@ -21,6 +21,7 @@
 //
 //	V3GraphVertex
 //	  OrderMoveVertex
 //        MTaskMoveVertex
 //	  OrderEitherVertex
 //	    OrderInputsVertex
 //	    OrderSettleVertex
@ -47,6 +48,7 @@
 #include "verilatedos.h"
 #include "V3Ast.h"
 #include "V3Graph.h"
 #include VL_INCLUDE_UNORDERED_MAP
 class OrderVisitor;
 class OrderMoveVertex;
@ -363,6 +365,57 @@ public:
    void domScopep(OrderMoveDomScope* ds) { m_domScopep=ds; }
 };
 // Counterpart of OrderMoveVertex, adapted for threaded code generation.
 class MTaskMoveVertex : public V3GraphVertex {
    // A vertex stands for either a logic node or a var node, never both,
    // so m_logicp and m_varp cannot both be set.  (Knowing that, the
    // storage could be made more compact.)
    OrderLogicVertex* m_logicp;  // Logic represented by this vertex
    const OrderEitherVertex* m_varp;  // Var represented by this vertex
    const AstScope* m_scopep;  // Scope passed at construction
    const AstSenTree* m_domainp;  // Domain passed at construction
 protected:
    friend class OrderVisitor;
    friend class MTaskMoveVertexMaker;
 public:
    // CONSTRUCTORS
    MTaskMoveVertex(V3Graph* graphp, OrderLogicVertex* logicp,
                    const OrderEitherVertex* varp,
                    const AstScope* scopep, const AstSenTree* domainp)
        : V3GraphVertex(graphp), m_logicp(logicp),
          m_varp(varp), m_scopep(scopep), m_domainp(domainp) {
        // Enforce the either-logic-or-var rule described above
        UASSERT(!(logicp && varp),
                "MTaskMoveVertex: logicp and varp may not both be set!\n");
    }
    virtual ~MTaskMoveVertex() {}
    // METHODS
    virtual MTaskMoveVertex* clone(V3Graph* graphp) const {
        // Cloning this vertex type is not supported
        v3fatalSrc("Unsupported");
        return NULL;
    }
    virtual OrderVEdgeType type() const { return OrderVEdgeType::VERTEX_MOVE; }
    virtual string dotColor() const {
        // Vertices without logic are drawn yellow
        if (!logicp()) return "yellow";
        return logicp()->dotColor();
    }
    virtual string name() const {
        // "color()" represents the mtask ID.
        const string mtaskId = cvtToStr(color());
        if (!logicp()) return "nolog\\nt="+mtaskId;
        return (logicp()->name()
                +"\\nMV:"
                +" d="+cvtToStr((void*)logicp()->domainp())
                +" s="+cvtToStr((void*)logicp()->scopep())
                +"\\nt="+mtaskId);
    }
    // ACCESSORS
    OrderLogicVertex* logicp() const { return m_logicp; }
    const OrderEitherVertex* varp() const { return m_varp; }
    const AstScope* scopep() const { return m_scopep; }
    const AstSenTree* domainp() const { return m_domainp; }
 };
 //######################################################################
 // Edge types
--- a/src/V3Partition.cpp
+++ b/src/V3Partition.cpp
--- a/src/V3Partition.h
+++ b/src/V3Partition.h
@ -0,0 +1,99 @@
 // -*- mode: C++; c-file-style: "cc-mode" -*-
 //*************************************************************************
 // DESCRIPTION: Verilator: Threading's logic to mtask partitioner
 //
 // Code available from: http://www.veripool.org/verilator
 //
 //*************************************************************************
 //
 // Copyright 2003-2018 by Wilson Snyder.  This program is free software; you can
 // redistribute it and/or modify it under the terms of either the GNU
 // Lesser General Public License Version 3 or the Perl Artistic License
 // Version 2.0.
 //
 // Verilator is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.
 //
 //*************************************************************************
 #ifndef _V3PARTITION_H_
 #define _V3PARTITION_H_
 #include "config_build.h"
 #include "verilatedos.h"
 #include <list>
 #include "V3Graph.h"
 #include "V3OrderGraph.h"
 class LogicMTask;
 typedef vl_unordered_map<const MTaskMoveVertex*, LogicMTask*> Vx2MTaskMap;
 //*************************************************************************
 /// V3Partition takes the fine-grained logic graph from V3Order and
 /// collapses it into a coarse-grained graph of AbstractLogicMTask's, each
 /// of which contains a set of the logic nodes from the fine-grained
 /// graph.
 class V3Partition {
    // MEMBERS
    // The destructor below is empty, so this class never deletes the graph.
    V3Graph* m_fineDepsGraphp;  // Fine-grained dependency graph
 public:
    // CONSTRUCTORS
    explicit V3Partition(V3Graph* fineDepsGraphp)
        : m_fineDepsGraphp(fineDepsGraphp) {}
    ~V3Partition() {}
    // METHODS
    // Fill in the provided empty graph with AbstractLogicMTask's and their
    // interdependencies.
    void go(V3Graph* mtasksp);
    // Internal self test; Verilator's main() calls it only when the
    // --debug-self-test option is given.
    static void selfTest();
    // Print out a hash of the shape of graphp.  Only needed to debug the
    // origin of some nondeterminism; otherwise this is pretty useless.
    static void hashGraphDebug(const V3Graph* graphp, const char* debugName);
    // Print debug stats about graphp whose nodes must be AbstractMTask's.
    static void debugMTaskGraphStats(const V3Graph* graphp, const string& name);
    // Operate on the final ExecMTask graph, immediately prior to code
    // generation time.
    static void finalize();
 private:
    // NOTE(review): presumably helpers for finalize() and go() respectively;
    // confirm against V3Partition.cpp.
    static void finalizeCosts(V3Graph* execMTaskGraphp);
    static void setupMTaskDeps(V3Graph* mtasksp, const Vx2MTaskMap* vx2mtaskp);
    VL_DEBUG_FUNC;  // Declare debug()
    VL_UNCOPYABLE(V3Partition);
 };
 //*************************************************************************
 // Give each distinct pointer a small sequential id, for e.g. nodep to
 // mtask mappings
 class PartPtrIdMap {
 private:
    // TYPES
    typedef vl_unordered_map <const void*, vluint64_t> PtrMap;
    // MEMBERS
    // Both are mutable so findId() can remain const while still recording
    // pointers it has not been asked about before.
    mutable vluint64_t m_nextId;  // Id for the next previously-unseen pointer
    mutable PtrMap m_id;  // Ids handed out so far, keyed by pointer
 public:
    // CONSTRUCTORS
    PartPtrIdMap() : m_nextId(0) {}
    // METHODS
    // Return the id of ptrp.  A pointer seen for the first time gets the
    // next sequential id (ids start at 0); later calls return the same id.
    vluint64_t findId(const void* ptrp) const {
        PtrMap::iterator knownIt = m_id.find(ptrp);
        if (knownIt == m_id.end()) {
            const vluint64_t freshId = m_nextId;
            m_id[ptrp] = freshId;
            ++m_nextId;
            return freshId;
        }
        return knownIt->second;
    }
 };
 #endif  // Guard
--- a/src/V3PartitionGraph.h
+++ b/src/V3PartitionGraph.h
@ -0,0 +1,108 @@
 // -*- mode: C++; c-file-style: "cc-mode" -*-
 //*************************************************************************
 // DESCRIPTION: Verilator: Threading's graph structures
 //
 // Code available from: http://www.veripool.org/verilator
 //
 //*************************************************************************
 //
 // Copyright 2003-2018 by Wilson Snyder.  This program is free software; you can
 // redistribute it and/or modify it under the terms of either the GNU
 // Lesser General Public License Version 3 or the Perl Artistic License
 // Version 2.0.
 //
 // Verilator is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.
 //
 //*************************************************************************
 #ifndef _V3PARTITIONGRAPH_H_
 #define _V3PARTITIONGRAPH_H_
 #include "config_build.h"
 #include "verilatedos.h"
 #include <list>
 #include "V3Graph.h"
 #include "V3OrderGraph.h"
 //*************************************************************************
 // MTasks and graph structures
 // Common base of the mtask vertex types: a graph vertex that exposes an
 // id and a cost.
 class AbstractMTask : public V3GraphVertex {
 public:
    // CONSTRUCTORS
    // explicit, so a V3Graph* is never implicitly converted into a vertex
    explicit AbstractMTask(V3Graph* graphp) : V3GraphVertex(graphp) {}
    virtual ~AbstractMTask() {}
    // METHODS
    virtual uint32_t id() const = 0;  // Id of this mtask
    virtual uint32_t cost() const = 0;  // Cost of this mtask
 };
 // An mtask that owns a list of MTaskMoveVertex logic vertices.
 class AbstractLogicMTask : public AbstractMTask {
 public:
    // TYPES
    typedef std::list<MTaskMoveVertex*> VxList;
    // CONSTRUCTORS
    // explicit, so a V3Graph* is never implicitly converted into a vertex
    explicit AbstractLogicMTask(V3Graph* graphp) : AbstractMTask(graphp) {}
    virtual ~AbstractLogicMTask() {}
    // METHODS
    // Set of logic vertices in this mtask. Order is not significant.
    virtual const VxList* vertexListp() const = 0;
    virtual uint32_t id() const = 0;  // Unique id of this mtask.
    virtual uint32_t cost() const = 0;  // Cost of this mtask
 };
 // An mtask vertex of the ExecMTask graph: holds the AstMTaskBody together
 // with the cost estimate and static scheduling data for that body.
 class ExecMTask : public AbstractMTask {
 private:
    // MEMBERS
    // Task body
    AstMTaskBody* m_bodyp;
    // Unique id of this mtask
    uint32_t m_id;
    // Predicted critical path from the start of this mtask to the ends of
    // the graph that are reachable from this mtask, in abstract time units
    uint32_t m_priority;
    // Predicted runtime of this mtask, in the same abstract time units as
    // priority()
    uint32_t m_cost;
    // Thread for static (pack_mtasks) scheduling, or 0xffffffff if not yet
    // assigned
    uint32_t m_thread;
    // Next for static (pack_mtasks) scheduling
    const ExecMTask* m_packNextp;
    // Is root thread
    bool m_threadRoot;
    VL_UNCOPYABLE(ExecMTask);
 public:
    // CONSTRUCTORS
    ExecMTask(V3Graph* graphp, AstMTaskBody* bodyp, uint32_t id)
        : AbstractMTask(graphp), m_bodyp(bodyp), m_id(id),
          m_priority(0), m_cost(0), m_thread(0xffffffff),
          m_packNextp(NULL), m_threadRoot(false) {}
    // ACCESSORS
    AstMTaskBody* bodyp() const { return m_bodyp; }
    virtual uint32_t id() const { return m_id; }
    uint32_t priority() const { return m_priority; }
    void priority(uint32_t pri) { m_priority = pri; }
    virtual uint32_t cost() const { return m_cost; }
    void cost(uint32_t cost) { m_cost = cost; }
    uint32_t thread() const { return m_thread; }
    void thread(uint32_t thread) { m_thread = thread; }
    const ExecMTask* packNextp() const { return m_packNextp; }
    void packNextp(const ExecMTask* nextp) { m_packNextp = nextp; }
    bool threadRoot() const { return m_threadRoot; }
    void threadRoot(bool threadRoot) { m_threadRoot = threadRoot; }
    // METHODS
    string cFuncName() const {
        // If this MTask maps to a C function, this should be the name
        return "__Vmtask__"+cvtToStr(m_id);
    }
    string name() const { return "mt"+cvtToStr(id()); }
    // Write a one-line description; optional fields are printed only
    // when they hold something other than their initial value.
    void dump(std::ostream& str) const {
        str <<name()<<"."<<((void*)this);
        const bool hasEstimates = (priority() != 0) || (cost() != 0);
        if (hasEstimates) {
            str <<" [pr="<<priority()<<" c="<<cvtToStr(cost())<<"]";
        }
        if (thread() != 0xffffffff) { str <<" th="<<thread(); }
        if (threadRoot()) { str <<" [ROOT]"; }
        if (const ExecMTask* nextp = packNextp()) { str <<" nx="<<nextp->name(); }
    }
 };
 // Stream an ExecMTask using its dump() format
 inline std::ostream& operator<<(std::ostream& os, const ExecMTask& rhs) {
    rhs.dump(os);
    return os;
 }
 #endif  // Guard
--- a/src/V3Trace.cpp
+++ b/src/V3Trace.cpp
@ -182,6 +182,7 @@ private:
    AstNode*		m_chgSubParentp;// Which node has call to m_chgSubFuncp
    int			m_chgSubStmts;	// Statements under function being built
    AstVarScope*	m_activityVscp;	// Activity variable
    uint32_t            m_activityNumber;  // Count of fields in activity variable
    uint32_t		m_code;		// Trace ident code# being assigned
    V3Graph		m_graph;	// Var/CFunc tracking
    TraceActivityVertex* m_alwaysVtxp;	// "Always trace" vertex
@ -297,7 +298,7 @@ private:
    void assignActivity() {
 	// Select activity numbers and put into each CFunc vertex
-	uint32_t activityNumber = 1;	// Note 0 indicates "slow"
+        m_activityNumber = 1;  // Note 0 indicates "slow"
 	for (V3GraphVertex* itp = m_graph.verticesBeginp(); itp; itp=itp->verticesNextp()) {
 	    if (TraceActivityVertex* vvertexp = dynamic_cast<TraceActivityVertex*>(itp)) {
 		if (!vvertexp->activityCodeValid()) {
@ -306,17 +307,39 @@ private:
 			// This makes us need less activityNumbers and so speeds up the fast path.
 			vvertexp->activityCode(TraceActivityVertex::ACTIVITY_SLOW);
 		    } else {
-			vvertexp->activityCode(activityNumber++);
+                        vvertexp->activityCode(m_activityNumber++);
 		    }
 		}
 	    }
 	}
-	// Insert global variable
+        AstVar* newvarp;
-	if (!activityNumber) activityNumber++;   // For simplicity, always create it
+        if (v3Global.opt.mtasks()) {
-	int activityBits = VL_WORDS_I(activityNumber)*VL_WORDSIZE;   // For tighter code; round to next 32 bit point.
+            // Create a vector of bytes, not bits, for the tracing vector,
-	AstVar* newvarp = new AstVar (m_chgFuncp->fileline(), AstVarType::MODULETEMP,
+            // so that we can set them atomically without locking.
-				      "__Vm_traceActivity", VFlagBitPacked(), activityBits);
+            //
            // TODO: It would be slightly faster to have a bit vector per
            // chain of packed MTasks, but we haven't packed the MTasks yet.
            // If we support fully threaded tracing in the future, it would
            // make sense to improve this at that time.
            AstNodeDType* newScalarDtp
                = new AstBasicDType(m_chgFuncp->fileline(), VFlagLogicPacked(), 1);
            v3Global.rootp()->typeTablep()->addTypesp(newScalarDtp);
            AstNodeDType* newArrDtp = new AstUnpackArrayDType(
                m_chgFuncp->fileline(),
                newScalarDtp,
                new AstRange(m_chgFuncp->fileline(),
                             VNumRange(m_activityNumber-1, 0, false)));
            v3Global.rootp()->typeTablep()->addTypesp(newArrDtp);
            newvarp = new AstVar(m_chgFuncp->fileline(),
                                 AstVarType::MODULETEMP,
                                  "__Vm_traceActivity", newArrDtp);
        } else {
            // For tighter code; round to next 32 bit point.
            int activityBits = VL_WORDS_I(m_activityNumber)*VL_WORDSIZE;
            newvarp = new AstVar(m_chgFuncp->fileline(), AstVarType::MODULETEMP,
                                 "__Vm_traceActivity", VFlagBitPacked(), activityBits);
        }
 	m_topModp->addStmtp(newvarp);
 	AstVarScope* newvscp = new AstVarScope(newvarp->fileline(), m_highScopep, newvarp);
 	m_highScopep->addVarp(newvscp);
@ -329,15 +352,23 @@ private:
 		    FileLine* fl = vvertexp->insertp()->fileline();
 		    uint32_t acode = vvertexp->activityCode();
 		    vvertexp->insertp()->addNextHere
-			(new AstAssign (fl,
+                        (new AstAssign(fl, selectActivity(fl, acode, true),
-					new AstSel (fl, new AstVarRef(fl, m_activityVscp, true),
+                                       new AstConst(fl, AstConst::LogicTrue())));
 						    acode, 1),
 					new AstConst (fl, AstConst::LogicTrue())));
 		}
 	    }
 	}
    }
    // Build a reference to the activity flag numbered acode.  lvalue is
    // passed through to the AstVarRef that is created.
    AstNode* selectActivity(FileLine* flp, uint32_t acode, bool lvalue) {
        AstVarRef* const refp = new AstVarRef(flp, m_activityVscp, lvalue);
        // Without mtasks the flag is selected as a 1-wide AstSel at acode;
        // with mtasks it is selected as array element acode.
        if (!v3Global.opt.mtasks()) return new AstSel(flp, refp, acode, 1);
        return new AstArraySel(flp, refp, acode);
    }
    AstCFunc* newCFunc(AstCFuncType type, const string& name, AstCFunc* basep) {
 	AstCFunc* funcp = new AstCFunc(basep->fileline(), name, basep->scopep());
 	funcp->slow(basep->slow());
@ -453,8 +484,7 @@ private:
 		    AstNode* condp = NULL;
 		    for (ActCodeSet::const_iterator csit = actset.begin(); csit!=actset.end(); ++csit) {
 			uint32_t acode = *csit;
-			AstNode* selp = new AstSel (fl, new AstVarRef(fl, m_activityVscp, false),
+                        AstNode* selp = selectActivity(fl, acode, false);
 						    acode, 1);
 			if (condp) condp = new AstOr (fl, condp, selp);
 			else condp = selp;
 		    }
@ -473,11 +503,19 @@ private:
 	// Clear activity after tracing completes
 	FileLine* fl = m_chgFuncp->fileline();
-	AstNode* clrp = new AstAssign (fl,
+        if (v3Global.opt.mtasks()) {
-				       new AstVarRef(fl, m_activityVscp, true),
+            for (uint32_t i = 0; i < m_activityNumber; ++i) {
-				       new AstConst(fl, V3Number(fl, m_activityVscp->width())));
+                AstNode* clrp = new AstAssign(fl, selectActivity(fl, i, true),
-	m_fullFuncp->addFinalsp(clrp->cloneTree(true));
+                                              new AstConst(fl, AstConst::LogicFalse()));
-	m_chgFuncp->addFinalsp(clrp);
+                m_fullFuncp->addFinalsp(clrp->cloneTree(true));
                m_chgFuncp->addFinalsp(clrp);
            }
        } else {
            AstNode* clrp = new AstAssign(fl, new AstVarRef(fl, m_activityVscp, true),
                                          new AstConst(fl, V3Number(fl, m_activityVscp->width())));
            m_fullFuncp->addFinalsp(clrp->cloneTree(true));
            m_chgFuncp->addFinalsp(clrp);
        }
    }
    uint32_t assignDeclCode(AstTraceDecl* nodep) {
@ -699,6 +737,7 @@ public:
 	m_chgSubFuncp = NULL;
 	m_chgSubParentp = NULL;
 	m_chgSubStmts = 0;
        m_activityNumber = 0;
        m_code = 0;
        m_finding = false;
 	m_funcNum = 0;
--- a/src/Verilator.cpp
+++ b/src/Verilator.cpp
@ -73,6 +73,7 @@
 #include "V3Param.h"
 #include "V3Parse.h"
 #include "V3ParseSym.h"
 #include "V3Partition.h"
 #include "V3PreShell.h"
 #include "V3Premit.h"
 #include "V3Reloop.h"
@ -524,6 +525,14 @@ void process () {
 	V3EmitC::emitcSyms();
 	V3EmitC::emitcTrace();
    }
    if (!v3Global.opt.xmlOnly()
        && v3Global.opt.mtasks()) {
        // Finalize our MTask cost estimates and pack the mtasks into
        // threads. Must happen pre-EmitC which relies on the packing
        // order. Must happen post-V3LifePost which changes the relative
        // costs of mtasks.
        V3Partition::finalize();
    }
    if (!v3Global.opt.xmlOnly()) { // Unfortunately we have some lint checks in emitc.
 	V3EmitC::emitc();
    }
@ -607,8 +616,11 @@ int main(int argc, char** argv, char** env) {
    VHashSha1::selfTest();
    AstBasicDTypeKwd::selfTest();
    V3Graph::selfTest();
-    V3TSP::selfTest();
+    if (v3Global.opt.debugSelfTest()) {
-    V3ScoreboardBase::selfTest();
+        V3TSP::selfTest();
        V3ScoreboardBase::selfTest();
        V3Partition::selfTest();
    }
    // Read first filename
    v3Global.readFiles();
--- a/test_regress/Makefile
+++ b/test_regress/Makefile
@ -44,7 +44,7 @@ endif
 .PHONY: test
 test:
-	$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --dist
+	$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --vltmt --dist
 ######################################################################
@ -61,6 +61,9 @@ nc:
 vlt:
 	$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --stop
 vltmt:
 	$(PERL) driver.pl $(DRIVER_FLAGS) --vltmt --stop
 ######################################################################
 random:
--- a/test_regress/driver.pl
+++ b/test_regress/driver.pl
@ -45,6 +45,7 @@ our %All_Scenarios
       nc    => ["simulator", "nc"],
       vcs   => ["simulator", "vcs"],
       vlt   => ["simulator", "vlt_all", "vlt"],
       vltmt => ["simulator", "vlt_all", "vltmt"],
    );
 #======================================================================
@ -104,6 +105,7 @@ if (! GetOptions (
          "ms!"         => sub { $opt_scenarios{ms} = $_[1]; },
          "nc!"         => sub { $opt_scenarios{nc} = $_[1]; },
          "vlt!"        => sub { $opt_scenarios{vlt} = $_[1]; },
          "vltmt!"      => sub { $opt_scenarios{vltmt} = $_[1]; },
          "vcs!"        => sub { $opt_scenarios{vcs} = $_[1]; },
          "<>"          => \&parameter,
    )) {
@ -322,6 +324,7 @@ sub new {
    $self->{scenario} ||= "ghdl" if $self->{ghdl};
    $self->{scenario} ||= "vcs" if $self->{vcs};
    $self->{scenario} ||= "vlt" if $self->{vlt};
    $self->{scenario} ||= "vltmt" if $self->{vltmt};
    $self->{scenario} ||= "nc" if $self->{nc};
    $self->{scenario} ||= "ms" if $self->{ms};
    $self->{scenario} ||= "iv" if $self->{iv};
@ -407,6 +410,7 @@ sub new {
 	ms_run_flags => [split(/\s+/,"-lib $self->{obj_dir}/work -c -do 'run -all;quit' ")],
 	# Verilator
 	vlt => 0,
        vltmt => 0,
 	verilator_flags => ["-cc",
 			    "-Mdir $self->{obj_dir}",
 			    "-OD",  # As currently disabled unless -O3
@ -420,7 +424,7 @@ sub new {
 	%$self};
    bless $self, $class;
-    $self->{vlt_all} = $self->{vlt};  # Any Verilator scenario
+    $self->{vlt_all} = $self->{vlt} || $self->{vltmt};  # Any Verilator scenario
    $self->{VM_PREFIX} ||= "V".$self->{name};
    $self->{stats} ||= "$self->{obj_dir}/V".$self->{name}."__stats.txt";
@ -593,6 +597,8 @@ sub compile_vlt_flags {
    unshift @verilator_flags, "--gdbbt" if $opt_gdbbt;
    unshift @verilator_flags, "--x-assign unique";  # More likely to be buggy
    unshift @verilator_flags, "--trace" if $opt_trace;
    unshift @verilator_flags, "--threads 3" if $param{vltmt};
    unshift @verilator_flags, "--debug-partition" if $param{vltmt};
    if (defined $opt_optimize) {
 	my $letters = "";
 	if ($opt_optimize =~ /[a-zA-Z]/) {
@ -746,6 +752,11 @@ sub compile {
 	    return 1;
 	}
        if ($self->{vltmt} && !$self->cfg_with_threaded) {
            $self->skip("Test requires Verilator configured with threads\n");
            return 1;
        }
 	if (!$param{fails} && $param{verilator_make_gcc}
 	    && $param{make_main}) {
 	    $self->_make_main();
@ -2045,7 +2056,11 @@ Run Synopsys VCS simulator tests.
 =item --vlt
-Run Verilator tests.  Default unless another scenario flag is provided.
+Run Verilator tests in single-threaded mode.  Default unless another scenario flag is provided.
 =item --vltmt
 Run Verilator tests in multithreaded mode.
 =back
--- a/test_regress/t/t_a_selftest.pl
+++ b/test_regress/t/t_a_selftest.pl
@ -0,0 +1,22 @@
 #!/usr/bin/perl
 if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
 # DESCRIPTION: Verilator: Verilog Test driver/expect definition
 #
 # Copyright 2003 by Wilson Snyder. This program is free software; you can
 # redistribute it and/or modify it under the terms of either the GNU
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
 # Runs Verilator with --debug-self-test so that its internal self tests
 # are exercised by the regression.
 scenarios(vlt_all => 1);
 top_filename("t/t_EXAMPLE.v");
 compile(
    verilator_flags2 => ['--debug-self-test'],
    # NOTE(review): presumably these three turn off building the generated
    # model, since only the Verilator run itself matters here - confirm
    # against driver.pl.
    verilator_make_gcc => 0,
    make_top_shell => 0,
    make_main => 0,
    );
 ok(1);
 1;
--- a/test_regress/t/t_case_huge.pl
+++ b/test_regress/t/t_case_huge.pl
@ -15,7 +15,8 @@ compile(
 if ($Self->{vlt_all}) {
    file_grep ($Self->{stats}, qr/Optimizations, Tables created\s+(\d+)/i, 10);
-    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 8);
+    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
               ($Self->{vltmt} ? 0 : 8));
 }
 execute(
--- a/test_regress/t/t_dpi_threads.pl
+++ b/test_regress/t/t_dpi_threads.pl
@ -0,0 +1,21 @@
 #!/usr/bin/perl
 if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
 # DESCRIPTION: Verilator: Verilog Test driver/expect definition
 #
 # Copyright 2018 by Wilson Snyder. This program is free software; you can
 # redistribute it and/or modify it under the terms of either the GNU
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
 # Multithreaded-only test that the DPI import calls in the design do not
 # run concurrently; the C side (t_dpi_threads_c.cpp) records a failure if
 # two calls overlap.
 scenarios(vltmt => 1);
 compile(
    v_flags2 => ["t/t_dpi_threads_c.cpp --no-threads-coarsen"],
    );
 execute(
    check_finished => 1,
    );
 ok(1);
 1;
 1;
--- a/test_regress/t/t_dpi_threads.v
+++ b/test_regress/t/t_dpi_threads.v
@ -0,0 +1,62 @@
 // DESCRIPTION: Verilator: Verilog Test module
 //
 // Copyright 2018 by Wilson Snyder. This program is free software; you can
 // redistribute it and/or modify it under the terms of either the GNU
 // Lesser General Public License Version 3 or the Perl Artistic License
 // Version 2.0.
 import "DPI-C" dpii_sys_task = function void \$dpii_sys ();
 import "DPI-C" dpii_failure = function int \$dpii_failure ();
 module t (clk);
   input clk;
   integer cyc;
   integer failure;
   initial cyc = 0;
 `ifndef verilator
   `error "Only Verilator supports PLI-ish DPI calls."
 `endif
   // On cycle 2, read back whether the DPI code recorded a collision and
   // stop with an error if it did; otherwise finish normally.
   always @ (posedge clk) begin
      if (cyc == 2) begin
         failure = $dpii_failure();
         $write("* failure = %0d\n", failure);
         if (failure > 0) begin
            $stop;
         end
         $write("*-* All Finished *-*\n");
         $finish;
      end
      cyc <= cyc + 1;
   end
   // The purpose of this test is to confirm that the DPI-call serialization
   // code in V3Partition does ensure that these DPI calls do not run
   // concurrently.
   //
   // Alternatively, the test may be run with "--threads-dpi all" in which case
   // it should confirm that the calls do run concurrently and do detect a
   // collision (they should, if the test is set up right.)  This is
   // t_dpi_threads_collide.pl.
   //
   // Q) Is it a risk that the partitioner will merge or serialize these always
   //    blocks, just by luck, even if the DPI-call serialization code fails?
   //
   // A) Yes, that's why t_dpi_threads_collide.pl also passes
   //    --no-threads-coarsen to disable MTask coarsening.  This ensures that
   //    the MTask graph at the end of FixDataHazards (where we resolve DPI
   //    hazards) is basically the final MTasks graph, and that data hazards
   //    which persist beyond FixDataHazards should persist in the final
   //    generated C code.
   always @ (posedge clk) begin
      $dpii_sys();
   end
   always @ (posedge clk) begin
      $dpii_sys();
   end
 endmodule
--- a/test_regress/t/t_dpi_threads_c.cpp
+++ b/test_regress/t/t_dpi_threads_c.cpp
@ -0,0 +1,78 @@
 // -*- mode: C++; c-file-style: "cc-mode" -*-
 //*************************************************************************
 //
 // Copyright 2018-2018 by Wilson Snyder. This program is free software; you can
 // redistribute it and/or modify it under the terms of either the GNU
 // Lesser General Public License Version 3 or the Perl Artistic License
 // Version 2.0.
 //
 // Verilator is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.
 //
 //*************************************************************************
 #include <atomic>
 #include <cstdio>
 #include <iostream>
 #include <unistd.h>
 #include "svdpi.h"
 //======================================================================
 #if defined(VERILATOR)
 # ifdef T_DPI_THREADS_COLLIDE
 #  include "Vt_dpi_threads_collide__Dpi.h"
 # else
 #  include "Vt_dpi_threads__Dpi.h"
 # endif
 #elif defined(VCS)
 # include "../vc_hdrs.h"
 #elif defined(CADENCE)
 # define NEED_EXTERNS
 #else
 # error "Unknown simulator for DPI test"
 #endif
 #ifdef NEED_EXTERNS
 extern "C" {
    extern void dpii_sys_task();
    extern int dpii_failure();
 }
 #endif
 //======================================================================
 // State shared by the DPI functions in this file.
 struct state {
    std::atomic<bool> task_is_running;  // Set while a dpii_sys_task() call is in progress
    std::atomic<int> failure;  // Set to 1 once a collision has been seen
    // failure is an int, so it starts from 0 rather than the bool false
    state() : task_is_running(false)
            , failure(0) {}
 };
 static state st;
 // DPI import body.  Records a failure when it is entered while another
 // call is still in progress, then keeps the "running" flag set for one
 // second before clearing it.
 void dpii_sys_task() {
    const bool collided = st.task_is_running.exchange(true);
    if (!collided) {
        std::cerr << "t_dpi_threads_c.cpp dpii_sys_task() no collision. @" << &st.task_is_running << "\n";
    } else {
        // The flag was already set: another call is inside this function.
        st.failure = 1;
        std::cerr << "t_dpi_threads_c.cpp dpii_sys_task() saw threads collide.\n";
    }
    // Linger inside the DPI call so that any possible collision is very
    // likely to happen.  This cannot strictly guarantee that every race is
    // caught, but one second dwarfs the expected runtime of everything
    // else in the test, so races should show up just about every time.
    sleep(1);
    st.task_is_running.exchange(false);
 }
 // DPI import body: return the failure flag recorded by dpii_sys_task().
 int dpii_failure() {
    const int seen = st.failure.load();
    return seen;
 }
--- a/test_regress/t/t_dpi_threads_collide.pl
+++ b/test_regress/t/t_dpi_threads_collide.pl
@ -0,0 +1,28 @@
 #!/usr/bin/perl
 if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
 # DESCRIPTION: Verilator: Verilog Test driver/expect definition
 #
 # Copyright 2018 by Wilson Snyder. This program is free software; you can
 # redistribute it and/or modify it under the terms of either the GNU
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
 scenarios(vltmt => 1);
 # Reuses the design of t_dpi_threads
 top_filename("t/t_dpi_threads.v");
 compile(
    v_flags2 => ["t/t_dpi_threads_c.cpp --threads-dpi all --no-threads-coarsen"],
    );
 # Similar to t_dpi_threads, which confirms that Verilator can prevent a
 # race between DPI import calls, this test confirms that the race exists
 # and that the DPI C code can detect it under --threads-dpi all
 # mode.
 #
 # The run is expected to fail: the design calls $stop once the DPI code
 # reports a collision.
 execute(
    fails => 1,
    );
 ok(1);
 1;
 1;
--- a/test_regress/t/t_emit_memb_limit.pl
+++ b/test_regress/t/t_emit_memb_limit.pl
@ -43,7 +43,10 @@ gen($Self->{top_filename}, 6000);
 compile(
    verilator_flags2=>["-x-assign fast --x-initial fast",
                       "-Wno-UNOPTTHREADS",
-    ],
+                       # The slow V3Partition asserts are just too slow
                       # in this test. They're disabled just for performance
                       # reasons:
                       "--no-debug-partition"],
    );
 execute(
--- a/test_regress/t/t_gantt.pl
+++ b/test_regress/t/t_gantt.pl
@ -0,0 +1,74 @@
 #!/usr/bin/perl
 if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
 # DESCRIPTION: Verilator: Verilog Test driver/expect definition
 #
 # Copyright 2003 by Wilson Snyder. This program is free software; you can
 # redistribute it and/or modify it under the terms of either the GNU
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
 use IO::File;
 # Test for bin/verilator_gantt,
 #
 # Only needed in multithreaded regression.
 scenarios(vltmt => 1);
 # It doesn't really matter what test
 # we use, so long as it runs several cycles,
 # enough for the profiling to happen:
 top_filename("t/t_gen_alw.v");
 compile(
    v_flags2 => ["--prof-threads"]
    );
 execute(
    all_run_flags => ["+verilator+prof+threads+start+2",
                      " +verilator+prof+threads+window+2",
                      " +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
                      ],
    check_finished => 1,
    );
 # verilator_gantt is given the profile_threads.dat file named in the run
 # flags above; whatever it prints on STDOUT is captured in gantt.log.
 run(cmd => ["$ENV{VERILATOR_ROOT}/bin/verilator_gantt",
            "$Self->{obj_dir}/profile_threads.dat",
            "--vcd $Self->{obj_dir}/profile_threads.vcd",
            "> $Self->{obj_dir}/gantt.log"]);
 # We should have three lines of gantt chart, each with
 # an even number of mtask-bars (eg "[123--]")
 my $gantt_line_ct = 0;
 my $global_mtask_ct = 0;
 {
    my $gantt_log = "$Self->{obj_dir}/gantt.log";
    # Pass the mode as its own argument so that the path can never be
    # interpreted as part of the open mode.
    my $fh = IO::File->new($gantt_log, "r")
        or error("$! $gantt_log");
    # $fh is undef if the open failed (error() was already called above).
    # Test definedness so that a line consisting of "0" cannot end the loop.
    while ($fh && defined(my $line = $fh->getline)) {
        next if $line !~ m/^  t:/;
        $gantt_line_ct++;
        my @mtasks = split(/\[/, $line);
        shift @mtasks;  # Discard the text before the first '['
        # Each remaining piece is one mtask bar, e.g. "123--]"; the hyphens
        # and the ] may or may not appear, it depends on exact timing.
        my $this_thread_mtask_ct = scalar(@mtasks);
        $global_mtask_ct += $this_thread_mtask_ct;
        if ($this_thread_mtask_ct % 2 != 0) { error("odd number of mtasks found"); }
    }
 }
 if ($gantt_line_ct != 3) { error("wrong number of gantt lines"); }
 if ($global_mtask_ct == 0) { error("wrong number of mtasks, should be > 0"); }
 print "Found $gantt_line_ct lines of gantt data with $global_mtask_ct mtasks\n"
    if $Self->{verbose};
 # Diff to itself, just to check parsing
 vcd_identical("$Self->{obj_dir}/profile_threads.vcd", "$Self->{obj_dir}/profile_threads.vcd");
 ok(1);
 1;
--- a/test_regress/t/t_gate_tree.pl
+++ b/test_regress/t/t_gate_tree.pl
@ -117,6 +117,10 @@ compile(
    );
 execute(
    all_run_flags => ["+verilator+prof+threads+start+100",
                      " +verilator+prof+threads+window+2",
                      " +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
                      ],
    check_finished => 1,
    );
--- a/test_regress/t/t_help.pl
+++ b/test_regress/t/t_help.pl
@ -13,6 +13,7 @@ foreach my $prog (
    "../bin/verilator",
    "../bin/verilator_coverage",
    "../bin/verilator_difftree",
    "../bin/verilator_gantt",
    "../bin/verilator_profcfunc",
    ) {
    run(fails => 1,
--- a/test_regress/t/t_inst_tree_inl0_pub1.pl
+++ b/test_regress/t/t_inst_tree_inl0_pub1.pl
@ -38,7 +38,8 @@ sub checkRelativeRefs {
 if ($Self->{vlt_all}) {
    # We expect to combine sequent functions across multiple instances of
    # l2, l3, l4, l5. If this number drops, please confirm this has not broken.
-    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 52);
+    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
               ($Self->{vltmt} ? 84 : 52));
    # Expect absolute refs in CFuncs for t (top module) and l1 (because it
    # has only one instance)
--- a/test_regress/t/t_inst_tree_inl0_pub1_norelcfuncs.pl
+++ b/test_regress/t/t_inst_tree_inl0_pub1_norelcfuncs.pl
@ -18,7 +18,8 @@ compile(
 if ($Self->{vlt_all}) {
    # Fewer optimizations than t_inst_tree_inl0_pub1 which allows
    # relative CFuncs:
-    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 31);
+    file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
               ($Self->{vltmt} ? 0 : 31));
    # Should not find any 'this->' except some 'this->__VlSymsp'
    my @files = `ls $Self->{obj_dir}/*.cpp`;
--- a/test_regress/t/t_threads_counter_1.pl
+++ b/test_regress/t/t_threads_counter_1.pl
@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
-scenarios(simulator => 1);
+scenarios(vltmt => 1);
 $Self->cfg_with_threaded or skip("No thread support");
 top_filename("t/t_threads_counter.v");
--- a/test_regress/t/t_threads_counter_2.pl
+++ b/test_regress/t/t_threads_counter_2.pl
@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
-scenarios(simulator => 1);
+scenarios(vltmt => 1);
 $Self->cfg_with_threaded or skip("No thread support");
 top_filename("t/t_threads_counter.v");
--- a/test_regress/t/t_threads_counter_4.pl
+++ b/test_regress/t/t_threads_counter_4.pl
@ -0,0 +1,23 @@
 #!/usr/bin/perl
 if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
 # DESCRIPTION: Verilator: Verilog Test driver/expect definition
 #
 # Copyright 2003-2009 by Wilson Snyder. This program is free software; you can
 # redistribute it and/or modify it under the terms of either the GNU
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
 # Restrict this test to the "vltmt" scenario.
 scenarios(vltmt => 1);
 # Use t/t_threads_counter.v as the top-level source for this test.
 top_filename("t/t_threads_counter.v");
 # Compile with "--cc --threads 4" passed through verilator_flags2.
 # NOTE(review): presumably this requests a 4-thread model; confirm against
 # the Verilator --threads documentation.
 compile(
    verilator_flags2 => ['--cc --threads 4'],
    );
 # Run the compiled model with check_finished enabled.
 # NOTE(review): check_finished presumably makes the driver require that the
 # simulation reports completion; confirm in the test driver.
 execute(
    check_finished => 1,
    );
 # Record the test as passed, then return a true value to the driver.
 ok(1);
 1;
--- a/test_regress/t/t_threads_nondeterminism.pl
+++ b/test_regress/t/t_threads_nondeterminism.pl
@ -0,0 +1,25 @@
 #!/usr/bin/perl
 if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
 # DESCRIPTION: Verilator: Verilog Test driver/expect definition
 #
 # Copyright 2003-2009 by Wilson Snyder. This program is free software; you can
 # redistribute it and/or modify it under the terms of either the GNU
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
 # Restrict this test to the "vltmt" scenario.
 scenarios(vltmt => 1);
 # Use t/t_threads_counter.v as the top-level source for this test.
 top_filename("t/t_threads_counter.v");
 # Compile with "--cc --threads 2 --debug-nondeterminism" passed through
 # verilator_flags2.
 compile(
    verilator_flags2 => ['--cc --threads 2 --debug-nondeterminism'],
    );
 # Run the compiled model with check_finished enabled.
 # NOTE(review): check_finished presumably makes the driver require that the
 # simulation reports completion; confirm in the test driver.
 execute(
    check_finished => 1,
    );
 # Look for "hash of shape" (case-insensitive) in the compile log under obj_dir.
 # NOTE(review): presumably this text is emitted because of
 # --debug-nondeterminism; confirm against the Verilator sources.
 file_grep("$Self->{obj_dir}/vlt_compile.log", qr/hash of shape/i);
 # Record the test as passed, then return a true value to the driver.
 ok(1);
 1;
--- a/test_regress/t/t_verilated_all.pl
+++ b/test_regress/t/t_verilated_all.pl
@ -13,7 +13,12 @@ my $root = "..";
 compile(
    # Can't use --coverage and --savable together, so cheat and compile inline
-    verilator_flags2 => ['--cc --coverage-toggle --coverage-line --coverage-user --trace --vpi $root/include/verilated_save.cpp'],
+    verilator_flags2 => ["--cc",
                         "--coverage-toggle --coverage-line --coverage-user",
                         "--trace --vpi ",
                         ($Self->cfg_with_threaded
                          ? "--threads 2 $root/include/verilated_threads.cpp" : ""),
                         "$root/include/verilated_save.cpp"],
    );
 execute(
@ -43,7 +48,8 @@ foreach my $dfile (glob("$Self->{obj_dir}/*.d")) {
 foreach my $file (sort keys %hit) {
    if (!$hit{$file}
-        && $file !~ /_sc/) {
+        && $file !~ /_sc/
        && ($file !~ /_thread/ || $Self->cfg_with_threaded)) {
        error("Include file not covered by t_verilated_all test: ",$file);
    }
 }
--- a/test_regress/t/t_verilated_threaded.pl
+++ b/test_regress/t/t_verilated_threaded.pl
@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
 # Lesser General Public License Version 3 or the Perl Artistic License
 # Version 2.0.
-scenarios(simulator => 1);
+scenarios(vltmt => 1);
 $Self->cfg_with_threaded or skip("No thread support");
 top_filename("t/t_verilated_all.v");