MAJOR: Add multithreaded model generation.
This commit is contained in:
parent
0070520edb
commit
ec8dbbffed
7
Changes
7
Changes
|
|
@ -5,15 +5,18 @@ The contributors that suggested a given feature are shown in []. Thanks!
|
||||||
* Verilator 4.000 devel
|
* Verilator 4.000 devel
|
||||||
|
|
||||||
** This is a major release. Any patches may require major rework to apply.
|
** This is a major release. Any patches may require major rework to apply.
|
||||||
|
[Thanks everyone]
|
||||||
|
|
||||||
|
** Add multithreaded model generation.
|
||||||
|
|
||||||
** Add runtime arguments.
|
** Add runtime arguments.
|
||||||
|
|
||||||
** Fix internals to be C++ null-pointer-check clean.
|
|
||||||
|
|
||||||
*** Better optimize large always block splitting, bug1244. [John Coiner]
|
*** Better optimize large always block splitting, bug1244. [John Coiner]
|
||||||
|
|
||||||
*** Add new reloop optimization for repetitive assignment compression.
|
*** Add new reloop optimization for repetitive assignment compression.
|
||||||
|
|
||||||
|
**** Fix internals to be C++ null-pointer-check clean.
|
||||||
|
|
||||||
**** Fix internals to avoid 'using namespace std'.
|
**** Fix internals to avoid 'using namespace std'.
|
||||||
|
|
||||||
**** Fix Verilation performance issues, bug1316. [John Coiner]
|
**** Fix Verilation performance issues, bug1316. [John Coiner]
|
||||||
|
|
|
||||||
|
|
@ -120,6 +120,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \
|
||||||
bin/verilator \
|
bin/verilator \
|
||||||
bin/verilator_coverage \
|
bin/verilator_coverage \
|
||||||
bin/verilator_difftree \
|
bin/verilator_difftree \
|
||||||
|
bin/verilator_gantt \
|
||||||
bin/verilator_includer \
|
bin/verilator_includer \
|
||||||
bin/verilator_profcfunc \
|
bin/verilator_profcfunc \
|
||||||
doxygen-mainpage doxygen.config veripool-logo.png \
|
doxygen-mainpage doxygen.config veripool-logo.png \
|
||||||
|
|
@ -154,6 +155,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \
|
||||||
INST_PROJ_FILES = \
|
INST_PROJ_FILES = \
|
||||||
bin/verilator \
|
bin/verilator \
|
||||||
bin/verilator_coverage \
|
bin/verilator_coverage \
|
||||||
|
bin/verilator_gantt \
|
||||||
bin/verilator_includer \
|
bin/verilator_includer \
|
||||||
bin/verilator_profcfunc \
|
bin/verilator_profcfunc \
|
||||||
include/verilated.mk \
|
include/verilated.mk \
|
||||||
|
|
@ -272,12 +274,12 @@ internals.pdf: internals.pod Makefile
|
||||||
|
|
||||||
# See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
|
# See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
|
||||||
VL_INST_BIN_FILES = verilator verilator_bin verilator_bin_dbg verilator_coverage_bin_dbg \
|
VL_INST_BIN_FILES = verilator verilator_bin verilator_bin_dbg verilator_coverage_bin_dbg \
|
||||||
verilator_coverage verilator_includer verilator_profcfunc
|
verilator_coverage verilator_gantt verilator_includer verilator_profcfunc
|
||||||
# Some scripts go into both the search path and pkgdatadir,
|
# Some scripts go into both the search path and pkgdatadir,
|
||||||
# so they can be found by the user, and under $VERILATOR_ROOT.
|
# so they can be found by the user, and under $VERILATOR_ROOT.
|
||||||
|
|
||||||
# See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
|
# See uninstall also - don't put wildcards in this variable, it might uninstall other stuff
|
||||||
VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_profcfunc.1
|
VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_gantt.1 verilator_profcfunc.1
|
||||||
|
|
||||||
VL_INST_INC_BLDDIR_FILES = \
|
VL_INST_INC_BLDDIR_FILES = \
|
||||||
include/verilated_config.h \
|
include/verilated_config.h \
|
||||||
|
|
@ -295,6 +297,7 @@ installbin:
|
||||||
$(SHELL) ${srcdir}/mkinstalldirs $(DESTDIR)$(bindir)
|
$(SHELL) ${srcdir}/mkinstalldirs $(DESTDIR)$(bindir)
|
||||||
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator $(DESTDIR)$(bindir)/verilator )
|
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator $(DESTDIR)$(bindir)/verilator )
|
||||||
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_coverage $(DESTDIR)$(bindir)/verilator_coverage )
|
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_coverage $(DESTDIR)$(bindir)/verilator_coverage )
|
||||||
|
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_gantt $(DESTDIR)$(bindir)/verilator_gantt )
|
||||||
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_profcfunc $(DESTDIR)$(bindir)/verilator_profcfunc )
|
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_profcfunc $(DESTDIR)$(bindir)/verilator_profcfunc )
|
||||||
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin $(DESTDIR)$(bindir)/verilator_bin )
|
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin $(DESTDIR)$(bindir)/verilator_bin )
|
||||||
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin_dbg $(DESTDIR)$(bindir)/verilator_bin_dbg )
|
( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin_dbg $(DESTDIR)$(bindir)/verilator_bin_dbg )
|
||||||
|
|
|
||||||
187
bin/verilator
187
bin/verilator
|
|
@ -338,6 +338,7 @@ detailed descriptions in L</"VERILATION ARGUMENTS"> for more information.
|
||||||
--pipe-filter <command> Filter all input through a script
|
--pipe-filter <command> Filter all input through a script
|
||||||
--prefix <topname> Name of top level class
|
--prefix <topname> Name of top level class
|
||||||
--prof-cfuncs Name functions for profiling
|
--prof-cfuncs Name functions for profiling
|
||||||
|
--prof-threads Enable generating gantt chart data for threads
|
||||||
--private Debugging; see docs
|
--private Debugging; see docs
|
||||||
--public Debugging; see docs
|
--public Debugging; see docs
|
||||||
-pvalue+<name>=<value> Overwrite toplevel parameter
|
-pvalue+<name>=<value> Overwrite toplevel parameter
|
||||||
|
|
@ -350,6 +351,9 @@ detailed descriptions in L</"VERILATION ARGUMENTS"> for more information.
|
||||||
--stats-vars Provide statistics on variables
|
--stats-vars Provide statistics on variables
|
||||||
-sv Enable SystemVerilog parsing
|
-sv Enable SystemVerilog parsing
|
||||||
+systemverilogext+<ext> Synonym for +1800-2017ext+<ext>
|
+systemverilogext+<ext> Synonym for +1800-2017ext+<ext>
|
||||||
|
--threads <threads> Enable multithreading
|
||||||
|
--threads-dpi <mode> Enable multithreaded DPI
|
||||||
|
--threads-max-mtasks <mtasks> Tune maximum mtask partitioning
|
||||||
--top-module <topname> Name of top level input module
|
--top-module <topname> Name of top level input module
|
||||||
--trace Enable waveform creation
|
--trace Enable waveform creation
|
||||||
--trace-depth <levels> Depth of tracing
|
--trace-depth <levels> Depth of tracing
|
||||||
|
|
@ -386,6 +390,9 @@ detailed descriptions in L</"RUNTIME ARGUMENTS"> for more information.
|
||||||
+verilator+debug Enable debugging
|
+verilator+debug Enable debugging
|
||||||
+verilator+debugi+<value> Enable debugging at a level
|
+verilator+debugi+<value> Enable debugging at a level
|
||||||
+verilator+help Display help
|
+verilator+help Display help
|
||||||
|
+verilator+prof+threads+file+<filename> Set profile filename
|
||||||
|
+verilator+prof+threads+start+<value> Set profile starting point
|
||||||
|
+verilator+prof+threads+window+<value> Set profile duration
|
||||||
+verilator+rand+reset+<value> Set random reset technique
|
+verilator+rand+reset+<value> Set random reset technique
|
||||||
+verilator+V Verbose version and config
|
+verilator+V Verbose version and config
|
||||||
+verilator+version Show version and exit
|
+verilator+version Show version and exit
|
||||||
|
|
@ -1080,6 +1087,18 @@ Verilog module and line number the statement came from. This allows gprof
|
||||||
or oprofile reports to be correlated with the original Verilog source
|
or oprofile reports to be correlated with the original Verilog source
|
||||||
statements. See also L<verilator_profcfunc>.
|
statements. See also L<verilator_profcfunc>.
|
||||||
|
|
||||||
|
=item --prof-threads
|
||||||
|
|
||||||
|
Enable gantt chart data collection for threaded builds.
|
||||||
|
|
||||||
|
Verilator will record the start and end time of each macro-task across a
|
||||||
|
number of calls to eval. (What is a macro-task? See the Verilator internals
|
||||||
|
document.)
|
||||||
|
|
||||||
|
When profiling is enabled, the runtime will emit a blurb of profiling data
|
||||||
|
in non-human-friendly form. The C<verilator_gantt> script will transform
|
||||||
|
this into a nicer visual format and produce some related statistics.
|
||||||
|
|
||||||
=item --private
|
=item --private
|
||||||
|
|
||||||
Opposite of --public. Is the default; this option exists for backwards
|
Opposite of --public. Is the default; this option exists for backwards
|
||||||
|
|
@ -1134,7 +1153,10 @@ Enable including save and restore functions in the generated model.
|
||||||
|
|
||||||
The user code must create a VerilatedSerialize or VerilatedDeserialze
|
The user code must create a VerilatedSerialize or VerilatedDeserialze
|
||||||
object then calling the << or >> operators on the generated model and any
|
object then calling the << or >> operators on the generated model and any
|
||||||
other data the process needs saved/restored. For example:
|
other data the process needs saved/restored. These functions are not
|
||||||
|
thread safe, and are typically called only by a main thread.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
void save_model(const char* filenamep) {
|
void save_model(const char* filenamep) {
|
||||||
VerilatedSave os;
|
VerilatedSave os;
|
||||||
|
|
@ -1173,6 +1195,42 @@ compatibility with other simulators.
|
||||||
|
|
||||||
A synonym for C<+1800-2017ext+>I<ext>.
|
A synonym for C<+1800-2017ext+>I<ext>.
|
||||||
|
|
||||||
|
=item --threads I<threads>
|
||||||
|
|
||||||
|
=item --no-threads
|
||||||
|
|
||||||
|
With --threads 0 or --no-threads, the default, the generated model is not
|
||||||
|
thread safe. With --threads 1, the generated model is single threaded but
|
||||||
|
may run in a multithreaded environment. With --threads N, where N >= 2, the
|
||||||
|
model is generated to run multithreaded on up to N threads. See
|
||||||
|
L</"MULTITHREADING">.
|
||||||
|
|
||||||
|
=item --threads-dpi all
|
||||||
|
|
||||||
|
=item --threads-dpi none
|
||||||
|
|
||||||
|
=item --threads-dpi pure
|
||||||
|
|
||||||
|
When using --dpi with --threads, control what DPI tasks are thread safe.
|
||||||
|
|
||||||
|
With --threads-dpi all, enable Verilator to assume all DPI imports are
|
||||||
|
threadsafe, and to use thread-local storage for communication with DPI,
|
||||||
|
potentially improving performance. Any DPI libraries need appropriate
|
||||||
|
mutexes to avoid undefined behavior.
|
||||||
|
|
||||||
|
With --threads-dpi none, Verilator assumes DPI imports are not thread safe,
|
||||||
|
and Verilator will serialize calls to DPI imports by default, potentially
|
||||||
|
harming performance.
|
||||||
|
|
||||||
|
With --threads-dpi pure, the default, Verilator assumes DPI pure imports
|
||||||
|
are threadsafe, but non-pure DPI imports are not.
|
||||||
|
|
||||||
|
=item --threads-max-mtasks I<value>
|
||||||
|
|
||||||
|
Rarely needed. When using --threads, specify the number of mtasks the
|
||||||
|
model is to be partitioned into. If unspecified, Verilator approximates a
|
||||||
|
good value.
|
||||||
|
|
||||||
=item --top-module I<topname>
|
=item --top-module I<topname>
|
||||||
|
|
||||||
When the input Verilog contains more than one top level module, specifies
|
When the input Verilog contains more than one top level module, specifies
|
||||||
|
|
@ -1464,6 +1522,28 @@ Enable debugging at the provided level.
|
||||||
|
|
||||||
Display help and exit.
|
Display help and exit.
|
||||||
|
|
||||||
|
=item +verilator+prof+threads+file+I<filename>
|
||||||
|
|
||||||
|
When using --prof-threads, the filename to dump to. Defaults to
|
||||||
|
"profile_threads.dat".
|
||||||
|
|
||||||
|
=item +verilator+prof+threads+start+I<value>
|
||||||
|
|
||||||
|
When using --prof-threads, Verilator will wait until $time is at this
|
||||||
|
value, then start the profiling warmup, then capturing. Generally this
|
||||||
|
should be set to some time that is well within the normal operation of the
|
||||||
|
simulation, i.e. outside of reset. If 0, the dump is disabled. Defaults to
|
||||||
|
1.
|
||||||
|
|
||||||
|
=item +verilator+prof+threads+window+I<value>
|
||||||
|
|
||||||
|
When using --prof-threads, after $time reaches
|
||||||
|
+verilator+prof+threads+start, Verilator will warm up the profiling for
|
||||||
|
this number of eval() calls, then will capture the profiling of this number
|
||||||
|
of eval() calls. Defaults to 2, which makes sense for a
|
||||||
|
single-clock-domain module where it's typical to want to capture one
|
||||||
|
posedge eval() and one negedge eval().
|
||||||
|
|
||||||
=item +verilator+rand+reset+I<value>
|
=item +verilator+rand+reset+I<value>
|
||||||
|
|
||||||
When a model was Verilated using "-x-inital unique", sets the
|
When a model was Verilated using "-x-inital unique", sets the
|
||||||
|
|
@ -1635,6 +1715,9 @@ compile times, and --x-assign=fast --x-initial=fast may increase the risk
|
||||||
of reset bugs in trade for performance; see the above documentation for
|
of reset bugs in trade for performance; see the above documentation for
|
||||||
these flags.
|
these flags.
|
||||||
|
|
||||||
|
If using Verilated multithreaded, use C<numactl> to ensure you are using
|
||||||
|
non-conflicting hardware resources. See L</"MULTITHREADING">.
|
||||||
|
|
||||||
Minor Verilog code changes can also give big wins. You should not have any
|
Minor Verilog code changes can also give big wins. You should not have any
|
||||||
UNOPTFLAT warnings from Verilator. Fixing these warnings can result in
|
UNOPTFLAT warnings from Verilator. Fixing these warnings can result in
|
||||||
huge improvements; one user fixed their one UNOPTFLAT warning by making a
|
huge improvements; one user fixed their one UNOPTFLAT warning by making a
|
||||||
|
|
@ -2176,6 +2259,89 @@ the names of the .cpp files to compile in from the make variables generated
|
||||||
in obj_dir/Vour_classes.mk.
|
in obj_dir/Vour_classes.mk.
|
||||||
|
|
||||||
|
|
||||||
|
=head1 MULTITHREADING
|
||||||
|
|
||||||
|
Verilator experimentally supports multithreading.
|
||||||
|
|
||||||
|
With --no-threads, the default, the model is not thread safe, and any use
|
||||||
|
of more than one thread calling into one or even different Verilated models
|
||||||
|
may result in unpredictable behavior. This gives the highest single thread
|
||||||
|
performance.
|
||||||
|
|
||||||
|
With --threads 1, the generated model is single threaded, however the
|
||||||
|
support libraries are multithread safe. This allows different
|
||||||
|
instantiations of model(s) to potentially each be run under a different
|
||||||
|
thread. All threading is the responsibility of the user's C++ testbench.
|
||||||
|
|
||||||
|
With --threads N, where N is at least 2, the generated model will be
|
||||||
|
designed to run in parallel on N threads. The thread calling eval()
|
||||||
|
provides one of those threads, and the generated model will create and
|
||||||
|
manage the other N-1 threads. It's the client's responsibility not to
|
||||||
|
oversubscribe the available CPU cores. Under CPU oversubscription, the
|
||||||
|
Verilated model should not livelock nor deadlock, however, you can expect
|
||||||
|
performance to be far worse than it would be with proper stoichiometry of
|
||||||
|
threads and CPU cores.
|
||||||
|
|
||||||
|
The remainder of this section describes behavior with --threads 1 or
|
||||||
|
--threads N (not --no-threads).
|
||||||
|
|
||||||
|
VL_THREADED is defined when compiling a threaded Verilated module, causing
|
||||||
|
the Verilated support classes to become threadsafe.
|
||||||
|
|
||||||
|
The thread used for constructing a model must be the same thread that
|
||||||
|
calls eval() into the model, this is called the "eval thread". The thread
|
||||||
|
used to perform certain global operations such as saving and tracing must
|
||||||
|
be done by a "main thread". In most cases the eval thread and main thread
|
||||||
|
are the same thread (i.e. the user's top C++ testbench runs on a single
|
||||||
|
thread), but this is not required.
|
||||||
|
|
||||||
|
When running a multithreaded model, the default Linux task scheduler often
|
||||||
|
works against the model, by assuming threads are short lived, and thus
|
||||||
|
often schedules threads using multiple hyperthreads within the same
|
||||||
|
physical core. For best performance use the C<numactl> program to (when the
|
||||||
|
threading count fits) select unique physical cores on the same socket. For
|
||||||
|
example, if a model was Verilated with "--threads 4", we consult
|
||||||
|
|
||||||
|
egrep 'processor|physical id|core id' /proc/cpuinfo
|
||||||
|
|
||||||
|
To select cores 0, 1, 2, and 3 that are all located on the same socket (0)
|
||||||
|
but different physical cores. (Also useful is "numactl --hardware", or
|
||||||
|
C<lscpu> but those don't show Hyperthreading cores.) Then we execute
|
||||||
|
|
||||||
|
numactl -m 0 -C 0,1,2,3 -- verilated_executable_name
|
||||||
|
|
||||||
|
This will limit memory to socket 0, and threads to cores 0, 1, 2, 3,
|
||||||
|
(presumably on socket 0) optimizing performance. Of course this must be
|
||||||
|
adjusted if you want another simulator using e.g. socket 1, or if you
|
||||||
|
Verilated with a different number of threads. To see what CPUs are
|
||||||
|
actually used, use --prof-threads.
|
||||||
|
|
||||||
|
=head2 Multithreaded Verilog and Library Support
|
||||||
|
|
||||||
|
$display/$stop/$finish are delayed until the end of an eval() call in order
|
||||||
|
to maintain ordering between threads. This may result in additional tasks
|
||||||
|
completing after the $stop or $finish.
|
||||||
|
|
||||||
|
If using --coverage, the coverage routines are fully thread safe.
|
||||||
|
|
||||||
|
If using --dpi, Verilator assumes pure DPI imports are thread safe,
|
||||||
|
balancing performance versus safety. See --threads-dpi.
|
||||||
|
|
||||||
|
If using --savable, the save/restore classes are not multithreaded and
|
||||||
|
must be called only by the eval thread.
|
||||||
|
|
||||||
|
If using --sc, the SystemC kernel is not thread safe, therefore the eval
|
||||||
|
thread and main thread must be the same.
|
||||||
|
|
||||||
|
If using --trace, the tracing classes must be constructed and called from
|
||||||
|
the main thread.
|
||||||
|
|
||||||
|
If using --vpi, since SystemVerilog VPI was not architected by IEEE to be
|
||||||
|
multithreaded, Verilator requires all VPI calls are only made from the main
|
||||||
|
thread.
|
||||||
|
|
||||||
|
=back
|
||||||
|
|
||||||
=head1 CONFIGURATION FILES
|
=head1 CONFIGURATION FILES
|
||||||
|
|
||||||
In addition to the command line, warnings and other features may be
|
In addition to the command line, warnings and other features may be
|
||||||
|
|
@ -3636,6 +3802,21 @@ section for more details.
|
||||||
Ignoring this warning will only slow simulations, it will simulate
|
Ignoring this warning will only slow simulations, it will simulate
|
||||||
correctly.
|
correctly.
|
||||||
|
|
||||||
|
=item UNOPTTHREADS
|
||||||
|
|
||||||
|
Warns that the thread scheduler was unable to partition the design to fill
|
||||||
|
the requested number of threads.
|
||||||
|
|
||||||
|
One workaround is to request fewer threads with C<--threads>.
|
||||||
|
|
||||||
|
Another possible workaround is to allow more MTasks in the runtime, by
|
||||||
|
increasing the value of --threads-max-mtasks. More MTasks will result in
|
||||||
|
more communication and synchronization overhead at runtime; the scheduler
|
||||||
|
attempts to minimize the number of MTasks for this reason.
|
||||||
|
|
||||||
|
Ignoring this warning will only slow simulations, it will simulate
|
||||||
|
correctly.
|
||||||
|
|
||||||
=item UNPACKED
|
=item UNPACKED
|
||||||
|
|
||||||
Warns that unpacked structs and unions are not supported.
|
Warns that unpacked structs and unions are not supported.
|
||||||
|
|
@ -4185,6 +4366,8 @@ performance gain.
|
||||||
|
|
||||||
In 2009, major SystemVerilog and DPI language support was added.
|
In 2009, major SystemVerilog and DPI language support was added.
|
||||||
|
|
||||||
|
In 2018, Verilator 4.000 was released with multithreaded support.
|
||||||
|
|
||||||
Currently, various language features and performance enhancements are added
|
Currently, various language features and performance enhancements are added
|
||||||
as the need arises. Verilator is now about 3x faster than in 2002, and is
|
as the need arises. Verilator is now about 3x faster than in 2002, and is
|
||||||
faster than many popular commercial simulators.
|
faster than many popular commercial simulators.
|
||||||
|
|
@ -4282,7 +4465,7 @@ License Version 2.0.
|
||||||
|
|
||||||
=head1 SEE ALSO
|
=head1 SEE ALSO
|
||||||
|
|
||||||
L<verilator_coverage>, L<verilator_profcfunc>, L<make>,
|
L<verilator_coverage>, L<verilator_gantt>, L<verilator_profcfunc>, L<make>,
|
||||||
|
|
||||||
L<verilator --help> which is the source for this document,
|
L<verilator --help> which is the source for this document,
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,559 @@
|
||||||
|
: # -*-Mode: perl;-*- use perl, wherever it is
|
||||||
|
eval 'exec perl -wS $0 ${1+"$@"}'
|
||||||
|
if 0;
|
||||||
|
# See copyright, etc in below POD section.
|
||||||
|
######################################################################
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use warnings;
|
||||||
|
use Getopt::Long;
|
||||||
|
use Pod::Usage;
|
||||||
|
use vars qw ($Debug);
|
||||||
|
|
||||||
|
# Script-wide settings, filled in from the command line below.
$Debug = 0;
my $Opt_File;                  # Input data file; defaulted after option parsing
my $Opt_Time_Per_Char = 0;     # rdtsc ticks per char in gantt chart, 0=auto
my $opt_vcd = "profile_threads.vcd";  # VCD filename; undef skips write_vcd()

# Parsed profile data, written by read_data() and read by report().
our %Threads;
our %Mtasks;
our %Global;

autoflush STDOUT 1;
autoflush STDERR 1;
Getopt::Long::config ("no_auto_abbrev");
if (! GetOptions (
          "help"        => \&usage,
          "scale=i"     => \$Opt_Time_Per_Char,
          "debug"       => sub { $Debug = 1; },
          "vcd=s"       => \$opt_vcd,
          "no-vcd!"     => sub { $opt_vcd = undef; },
          # Non-option arguments are routed to parameter()
          "<>"          => \&parameter,
    )) {
    die "%Error: Bad usage, try 'verilator_gantt --help'\n";
}

# No positional filename given; fall back to the default data file name
$Opt_File = "profile_threads.dat" if !defined $Opt_File;

process($Opt_File);
write_vcd($opt_vcd) if defined $opt_vcd;
exit(0);
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
sub usage {
    # Handler for --help: render the script's POD to stdout.
    # pod2usage is asked to exit with status 2; the exit below only runs
    # should pod2usage ever return.
    my %pod_args = (
        -verbose => 2,
        -exitval => 2,
        -output  => \*STDOUT,
    );
    pod2usage(%pod_args);
    exit (1);
}
|
||||||
|
|
||||||
|
sub parameter {
    # Handler for non-option arguments: the first one becomes the input
    # filename, any further one is rejected.
    my ($param) = @_;
    die "%Error: Unknown parameter: $param\n" if defined $Opt_File;
    $Opt_File = $param;
}
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
sub process {
    # Load the profile data from the given file, then print the report.
    my ($filename) = @_;
    read_data($filename);
    report();
}
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
sub read_data {
    # Parse the profile data file line by line, filling in:
    #   %Threads{thread}{start} = {mtask, end, cpu}
    #   %Mtasks{mtask}          = {elapsed (summed), predict, end (latest)}
    #   %Global                 = {args, stats, rdtsc_cycle_time}
    # Dies if the file cannot be opened.
    my $filename = shift;

    %Global = (rdtsc_cycle_time => 0);

    my $fh = IO::File->new ($filename) or die "%Error: $! $filename,";
    while (my $line = $fh->getline) {
        if ($line =~ m/VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)/) {
            my $mtask = $1;
            my $start = $2;
            my $end = $3;
            my $elapsed_time = $4;
            my $predict_time = $5;
            my $cpu = $6;
            my $thread = $7;
            $Threads{$thread}{$start}{mtask} = $mtask;
            $Threads{$thread}{$start}{end} = $end;
            $Threads{$thread}{$start}{cpu} = $cpu;

            # An mtask may appear on several lines; accumulate its elapsed time
            if (!exists $Mtasks{$mtask}{elapsed}) {
                $Mtasks{$mtask}{elapsed} = 0;
            }
            $Mtasks{$mtask}{elapsed} += $elapsed_time;
            $Mtasks{$mtask}{predict} = $predict_time;
            $Mtasks{$mtask}{end} = max($Mtasks{$mtask}{end}, $end);
        }
        elsif ($line =~ /^VLPROFTHREAD/) {}
        # Argument lines come as "name+value" or "name value".  The value
        # must be captured with '+' so multi-digit values are not dropped.
        elsif ($line =~ m/VLPROF arg\s+(\S+)\+([0-9.]+)\s*$/
               || $line =~ m/VLPROF arg\s+(\S+)\s+([0-9.]+)\s*$/) {
            $Global{args}{$1} = $2;
        }
        elsif ($line =~ m/VLPROF stat\s+(\S+)\s+([0-9.]+)/) {
            $Global{stats}{$1} = $2;
        }
        elsif ($line =~ /^#/) {}
        elsif ($Debug) {
            chomp $line;
            print "Unk: $line\n";
        }
        # TODO -- this is parsing text printed by a client.
        # Really, verilator proper should generate this
        # if it's useful...
        if ($line =~ m/rdtsc time = (\d+) ticks/) {
            $Global{rdtsc_cycle_time} = $1;
        }
    }
    $fh->close;
}
|
||||||
|
|
||||||
|
sub report {
    # Print the text report from the data gathered in %Threads, %Mtasks and
    # %Global: argument settings, the gantt graph (via report_graph), the
    # analysis summary, and predicted-vs-elapsed statistics per mtask.
    print "Verilator Gantt report\n";

    print "\nArgument settings:\n";
    foreach my $arg (sort keys %{$Global{args}}) {
        # Arguments starting with '+' were written as name+value
        my $plus = ($arg =~ /^\+/) ? "+" : " ";
        printf " %s%s%d\n", $arg, $plus, $Global{args}{$arg};
    }

    my $nthreads = scalar keys %Threads;
    $Global{cpus}{cpu_time} = {};
    foreach my $thread (keys %Threads) {
        # Accumulate the time spent on each CPU across all threads
        foreach my $start (keys %{$Threads{$thread}}) {
            my $cpu = $Threads{$thread}{$start}{cpu};
            my $elapsed = $Threads{$thread}{$start}{end} - $start;
            $Global{cpus}{cpu_time}{$cpu} += $elapsed;
        }
    }

    my $mt_mtask_time = 0;
    my $long_mtask_time = 0;
    my $last_end = 0;
    foreach my $mtask (keys %Mtasks) {
        $mt_mtask_time += $Mtasks{$mtask}{elapsed};
        $last_end = max($last_end, $Mtasks{$mtask}{end});
        $long_mtask_time = max($long_mtask_time, $Mtasks{$mtask}{elapsed});
    }
    $Global{last_end} = $last_end;

    report_graph();

    # If we know cycle time in the same (rdtsc) units,
    # this will give us an actual utilization number,
    # (how effectively we keep the cores busy.)
    #
    # It also gives us a number we can compare against
    # serial mode, to estimate the overhead of data sharing,
    # which will show up in the total elapsed time.  (Overhead
    # of synchronization and scheduling should not.)
    print "\nAnalysis:\n";
    printf " Total threads = %d\n", $nthreads;
    printf " Total mtasks = %d\n", scalar (keys %Mtasks);
    # The CPUs seen are the keys under cpu_time; $Global{cpus} itself only
    # holds the single 'cpu_time' key.
    printf " Total cpus used = %d\n", scalar (keys %{$Global{cpus}{cpu_time}});
    # The yields statistic may be absent from the data file
    printf " Total yields = %d\n", ($Global{stats}{yields} || 0);
    printf " Total eval time = %d rdtsc ticks\n", $Global{last_end};
    printf " Longest mtask time = %d rdtsc ticks\n", $long_mtask_time;
    printf " All-thread mtask time = %d rdtsc ticks\n", $mt_mtask_time;
    # Guard the ratios against an empty data set (zero time or zero threads)
    my $total_time = $Global{last_end};
    my $long_efficiency = $total_time ? $long_mtask_time/$total_time : 0;
    printf " Longest-thread efficiency = %0.1f%%\n", $long_efficiency*100;
    my $thread_time = $total_time * $nthreads;
    my $mt_efficiency = $thread_time ? $mt_mtask_time/$thread_time : 0;
    printf " All-thread efficiency = %0.1f%%\n", $mt_efficiency*100;
    printf " All-thread speedup = %0.1f\n", $mt_efficiency*$nthreads;
    if ($Global{rdtsc_cycle_time} > 0) {
        my $ut = $mt_mtask_time / $Global{rdtsc_cycle_time};
        print "tot_mtask_cpu=$mt_mtask_time cyc=$Global{rdtsc_cycle_time} ut=$ut\n";
    }

    # Collect log(predict/elapsed) for every mtask that took measurable time,
    # remembering which mtasks have the smallest and largest ratio.
    my @p2e_ratios;
    my $min_p2e = 1000000;
    my $min_mtask;
    my $max_p2e = -1000000;
    my $max_mtask;
    foreach my $mtask (sort keys %Mtasks) {
        if ($Mtasks{$mtask}{elapsed} > 0) {
            if ($Mtasks{$mtask}{predict} == 0) {
                $Mtasks{$mtask}{predict} = 1;  # don't log(0) below
            }
            my $p2e_ratio = log( $Mtasks{$mtask}{predict} / $Mtasks{$mtask}{elapsed} );
            #print "log(p2e $mtask) = $p2e_ratio (predict $Mtasks{$mtask}{predict}, elapsed $Mtasks{$mtask}{elapsed})\n";
            push @p2e_ratios, $p2e_ratio;

            if ($p2e_ratio > $max_p2e) {
                $max_p2e = $p2e_ratio;
                $max_mtask = $mtask;
            }
            if ($p2e_ratio < $min_p2e) {
                $min_p2e = $p2e_ratio;
                $min_mtask = $mtask;
            }
        }
    }

    print "\nStatistics:\n";
    if (!@p2e_ratios) {
        # No mtask had nonzero elapsed time, so there is no min/max mtask to
        # name and nothing to average.
        print " No mtasks with nonzero elapsed time\n";
        print "\n";
        return;
    }
    print " min log(p2e) = $min_p2e from mtask $min_mtask (predict $Mtasks{$min_mtask}{predict}, elapsed $Mtasks{$min_mtask}{elapsed})\n";
    print " max log(p2e) = $max_p2e from mtask $max_mtask (predict $Mtasks{$max_mtask}{predict}, elapsed $Mtasks{$max_mtask}{elapsed})\n";

    my $stddev = stddev(\@p2e_ratios);
    my $mean = mean(\@p2e_ratios);
    print " mean = " . ($mean) . "\n";
    print " stddev = " . ($stddev) . "\n";
    print " e ^ stddev = " . exp($stddev). "\n";
    print "\n";
}
|
||||||
|
|
||||||
|
# Print an ASCII Gantt chart of mtask execution to STDOUT, one row per thread.
#
# Uses the file-level $Opt_Time_Per_Char and $Global{last_end}.  When
# $Opt_Time_Per_Char is 0 the scale (ticks per character) is picked
# automatically: start at 40 columns and keep halving the scale until
# _make_graph() reports no conflicts (or the scale reaches 10 ticks or
# less), then halve once more to leave room for labels.
sub report_graph {
    my $time_per = $Opt_Time_Per_Char;
    if ($time_per == 0) {
        $time_per = ($Global{last_end} / 40);  # Start with 40 columns
        while ($time_per > 10) {
            my ($graph, $conflicts) = _make_graph($time_per);
            last if !$conflicts;
            $time_per = int($time_per/2);
        }
        # One more step so we can fit more labels
        $time_per = int($time_per/2);
        # A short trace can round the scale down to zero; every later
        # division by $time_per would then die, so never go below one tick.
        $time_per = 1 if $time_per < 1;
    }

    my ($graph, $conflicts) = _make_graph($time_per);

    print "\nThread gantt graph:\n";
    print " Legend: One character width = $time_per rdtsc ticks\n";
    print " Legend: '&' = multiple mtasks in this period (character width)\n";

    # Ruler line: pad with '-' out to the column of the last recorded tick.
    my $scale = " <-".$Global{last_end}." rdtsc total";
    for (my $col = length($scale);  # -2 for '->' below
         $col < ($Global{last_end}/$time_per); ++$col) {
        $scale .= "-";
    }
    print " $scale->\n";

    foreach my $thread (sort keys %{$graph}) {
        print " t: ";
        _print_graph_line($graph->{$thread}, '');
    }
}
|
||||||
|
|
||||||
|
# Lay every thread's mtasks out on a character grid at $time_per ticks per
# column.  Returns ($graph, $conflicts): $graph is {thread}[column]{char},
# and $conflicts counts columns in which two or more mtasks begin.
sub _make_graph {
    my ($time_per) = @_;

    my $graph = {};  # {thread}[column]{char} = text placed at that column
    my $conflicts = 0;
    foreach my $thread (keys %Threads) {
        # Pass 1: append each mtask's whole "[cpu---]" label at its start
        # column, so a column may temporarily hold many characters.
        foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) {
            my $rec = $Threads{$thread}{$start};
            my $first_col = _time_col($time_per, $start);
            my $last_col = _time_col($time_per, $rec->{end});
            my $width = $last_col - $first_col + 1;

            my $label = "[" . "$rec->{cpu}";  # Maybe make optional in future
            my $pad = ($width - 1) - length($label);  # -1 for ']'
            $label .= "-" x $pad if $pad > 0;
            $label .= "]";
            $graph->{$thread}[$first_col]{char} .= $label;
        }
        if ($Debug) {
            print "# Multicol: "; _print_graph_line($graph->{$thread}, '|');
        }

        # Pass 2: spread each label over the following columns, one
        # character each.  The bound is re-read every iteration because the
        # row can grow while labels are being spread.
        my $col = 0;
        while ($col <= $#{$graph->{$thread}}) {
            my $chars = $graph->{$thread}[$col]{char};
            if ($chars) {
                # Does the label fit before the next occupied column?
                my $fits = 1;
                foreach my $offset (1 .. length($chars) - 1) {
                    next if !$graph->{$thread}[$col + $offset]{char};
                    $fits = 0;
                    last;
                }
                if ($fits) {
                    my @glyphs = split //, $chars;
                    foreach my $offset (0 .. $#glyphs) {
                        $graph->{$thread}[$col + $offset]{char} = $glyphs[$offset];
                    }
                } else {
                    if ($chars =~ /\[.*\[/) {  # Two begins or more
                        $conflicts++;
                        $graph->{$thread}[$col]{char} = "&";
                    } else {
                        $graph->{$thread}[$col]{char} = "[";
                    }
                    # Fill with 'x' up to, but not over, the next occupied column
                    foreach my $offset (1 .. length($chars) - 1) {
                        last if $graph->{$thread}[$col + $offset]{char};
                        $graph->{$thread}[$col + $offset]{char} = 'x';
                    }
                }
            }
            ++$col;
        }
        if ($Debug) {
            print "# Singlcol: "; _print_graph_line($graph->{$thread}, '|');
        }
    }
    print "# Conflicts $conflicts\n" if $Debug;
    return ($graph, $conflicts);
}
|
||||||
|
|
||||||
|
# Print one graph row to STDOUT: each column's {char} (a blank when the
# column has none) followed by $sep, then a newline.
sub _print_graph_line {
    my ($graph_thread, $sep) = @_;
    foreach my $col (0 .. $#{$graph_thread}) {
        my $cell = $graph_thread->[$col]{char};
        print((defined $cell ? $cell : ' '), $sep);
    }
    print "\n";
}
|
||||||
|
|
||||||
|
# Convert a time into a graph column index at $time_per time units per column.
sub _time_col {
    my ($time_per, $time) = @_;
    return int($time/$time_per);
}
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
|
||||||
|
# Write the per-thread mtask records in the file-level %Threads to $filename
# as a VCD dump.  Four groups of 32-bit signals are emitted under the scope
# "gantt": threads/thread<N>_mtask, cpus/cpu<N>_thread, mtasks/mtask<N>_cpu
# and Stats/parallelism.  A signal is driven to 'z' at the end of each
# interval and, at the first timestamp, when it has no value yet.
#
# Dies if the file cannot be opened or if closing it reports an error.
sub write_vcd {
    my $filename = shift;
    print "Writing $filename\n";
    # Pass the mode separately so nothing in $filename is parsed as a mode.
    my $fh = IO::File->new($filename, "w") or die "%Error: $! $filename,";
    my $vcd = {values => {},  # {<time>}{<code>} = value
               sigs => {},    # {<module>}{<sig>} = code
               code => 0,
    };

    # Return the code for a signal, allocating the next free one on first use.
    # Tests existence rather than truth so that code 0 is never re-allocated.
    my $code_of = sub {
        my ($module, $sig) = @_;
        if (!exists $vcd->{sigs}{$module}{$sig}) {
            $vcd->{sigs}{$module}{$sig} = $vcd->{code}++;
        }
        return $vcd->{sigs}{$module}{$sig};
    };

    my %parallelism;  # {<time>} = change in number of running mtasks
    foreach my $thread (keys %Threads) {
        my $tcode = $code_of->("threads", "thread${thread}_mtask");
        foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) {
            my $end = $Threads{$thread}{$start}{end};
            my $mtask = $Threads{$thread}{$start}{mtask};
            my $cpu = $Threads{$thread}{$start}{cpu};
            $vcd->{values}{$start}{$tcode} = $mtask;
            $vcd->{values}{$end}{$tcode} = undef;
            $parallelism{$start}++;
            $parallelism{$end}--;

            my $ccode = $code_of->("cpus", "cpu${cpu}_thread");
            $vcd->{values}{$start}{$ccode} = $thread;
            $vcd->{values}{$end}{$ccode} = undef;

            my $mcode = $code_of->("mtasks", "mtask${mtask}_cpu");
            $vcd->{values}{$start}{$mcode} = $cpu;
            $vcd->{values}{$end}{$mcode} = undef;
        }
    }
    {
        # Running sum of the start/end deltas gives the active mtask count.
        my $pcode = $code_of->("Stats", "parallelism");
        my $value = 0;
        foreach my $time (sort {$a<=>$b} keys %parallelism) {
            $value += $parallelism{$time};
            $vcd->{values}{$time}{$pcode} = $value;
        }
    }

    $fh->print('$version Generated by verilator_gantt $end'."\n");
    $fh->print('$timescale 1ns $end'."\n");
    $fh->print("\n");

    my %all_codes;
    $fh->print(' $scope module gantt $end'."\n");
    foreach my $module (sort keys %{$vcd->{sigs}}) {
        $fh->printf(' $scope module %s $end'."\n", $module);
        foreach my $sig (sort keys %{$vcd->{sigs}{$module}}) {
            my $code = $vcd->{sigs}{$module}{$sig};
            $fh->printf(' $var wire 32 v%x %s [31:0] $end'."\n",
                        $code, $sig);
            $all_codes{$code} = 1;
        }
        $fh->print(' $upscope $end'."\n");
    }
    $fh->print(' $upscope $end'."\n");
    $fh->print('$enddefinitions $end'."\n");
    $fh->print("\n");

    my $first = 1;
    foreach my $time (sort {$a <=> $b} keys %{$vcd->{values}}) {
        if ($first) {
            $first = 0;
            # Start with Z for any signals without time zero data
            foreach my $code (keys %all_codes) {
                if (!defined $vcd->{values}{$time}{$code}) {
                    $vcd->{values}{$time}{$code} = undef;
                }
            }
        }
        $fh->printf("#%d\n", $time);
        foreach my $code (sort keys %{$vcd->{values}{$time}}) {
            my $value = $vcd->{values}{$time}{$code};
            if (defined $value) {
                $fh->printf("b%b v%x\n", $value, $code);
            } else {
                $fh->printf("bz v%x\n", $code);
            }
        }
    }

    # Buffered write errors only surface at close, so check it.
    $fh->close or die "%Error: $! $filename,";
}
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# Similar to Statistics::Basic functions, but avoid a package dependency
|
||||||
|
|
||||||
|
# Return the numerically largest defined argument, or undef when there is
# none.  Undefined arguments are skipped wherever they appear, instead of
# ending the scan at the first one.
sub max {
    my $n;
    foreach my $v (@_) {
        next if !defined $v;
        $n = $v if !defined $n || $v > $n;
    }
    return $n;
}
|
||||||
|
|
||||||
|
# Arithmetic mean of the numbers in the referenced array; undef when the
# array is empty.
sub mean {
    my ($arrayref) = @_;
    my $n = scalar @$arrayref;
    return undef if !$n;
    my $sum = 0;
    $sum += $_ foreach @$arrayref;
    return $sum/$n;
}
|
||||||
|
|
||||||
|
# Population standard deviation of the numbers in the referenced array,
# computed as sqrt(E[x^2] - E[x]^2); undef when the array is empty.
sub stddev {
    my $arrayref = shift;
    my $n = 0;
    my $sum = 0;
    my $sumsq = 0;
    foreach my $v (@$arrayref) {
        $sum += $v;
        $sumsq += $v**2;
        $n++;
    }
    return undef if !$n;
    my $variance = ($sumsq/$n) - ($sum/$n)**2;
    # Floating-point rounding can leave a tiny negative difference when the
    # samples are (nearly) identical; sqrt() would die on that.
    $variance = 0 if $variance < 0;
    return sqrt($variance);
}
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
__END__
|
||||||
|
|
||||||
|
=pod
|
||||||
|
|
||||||
|
=head1 NAME
|
||||||
|
|
||||||
|
verilator_gantt - Create Gantt chart of multi-threaded execution
|
||||||
|
|
||||||
|
=head1 SYNOPSIS
|
||||||
|
|
||||||
|
Creates a visual representation to help analyze Verilator multithreaded
|
||||||
|
simulation performance, by showing when each macro-task starts and ends,
|
||||||
|
and showing when each thread is busy or idle.
|
||||||
|
|
||||||
|
The generated Gantt chart has time on the X-axis. Times shown are to the
|
||||||
|
scale printed, i.e. a certain amount of time for each character width. The
|
||||||
|
Y-axis shows threads, each thread's execution is shown on one line. That
|
||||||
|
line shows "[" at the position in time when it executes.
|
||||||
|
|
||||||
|
Following the "[" is the cpu number the task executed on, followed by zero
|
||||||
|
or more "-" to make the width of the characters match the scaled execution
|
||||||
|
time, followed by a "]". If the scale is too small, the cpu number and
|
||||||
|
mtask number will not be printed. If the scale is very small, a "&"
|
||||||
|
indicates multiple mtasks started at that time position.
|
||||||
|
|
||||||
|
Also creates a value change dump (VCD) format dump file which may be viewed
|
||||||
|
in a waveform viewer (e.g. C<GTKWave>). See below.
|
||||||
|
|
||||||
|
=head1 USAGE
|
||||||
|
|
||||||
|
Build with --prof-threads.
|
||||||
|
|
||||||
|
Run a sim with +verilator+prof+threads+window+2.
|
||||||
|
|
||||||
|
This will create profile_threads.dat.
|
||||||
|
|
||||||
|
Then run:
|
||||||
|
|
||||||
|
verilator_gantt profile_threads.dat
|
||||||
|
|
||||||
|
The report will be printed on standard output, this also generates
|
||||||
|
profile_threads.vcd
|
||||||
|
|
||||||
|
View profile_threads.vcd in a waveform viewer.
|
||||||
|
|
||||||
|
=head1 VCD SIGNALS
|
||||||
|
|
||||||
|
In waveforms there are the following signals. For most signals, the "decimal"
|
||||||
|
format will remove the leading zeros and make the traces easier to read.
|
||||||
|
|
||||||
|
parallelism: The number of mtasks active at this time, for best performance
|
||||||
|
this will match the thread count. You may want to use an "analog step"
|
||||||
|
format to view this signal.
|
||||||
|
|
||||||
|
cpu#_thread: For the given CPU number, the thread number executing.
|
||||||
|
|
||||||
|
mtask#_cpu: For the given mtask id, the CPU it is executing on.
|
||||||
|
|
||||||
|
thread#_mtask: For the given thread number, the mtask id executing.
|
||||||
|
|
||||||
|
=head1 ARGUMENTS
|
||||||
|
|
||||||
|
=over 4
|
||||||
|
|
||||||
|
=item I<filename>
|
||||||
|
|
||||||
|
The filename to read data from, defaults to "profile_threads.dat".
|
||||||
|
|
||||||
|
=item --help
|
||||||
|
|
||||||
|
Displays this message and program version and exits.
|
||||||
|
|
||||||
|
=item --scale I<n>
|
||||||
|
|
||||||
|
On the X-axis of the generated Gantt chart, each character represents this
|
||||||
|
many time units. (On x86, time units are rdtsc ticks.) Defaults to 0,
|
||||||
|
which will automatically compute a reasonable scale where no two mtasks
|
||||||
|
need to fit into same character width's worth of scaled time.
|
||||||
|
|
||||||
|
=item --no-vcd
|
||||||
|
|
||||||
|
=item --vcd I<filename>
|
||||||
|
|
||||||
|
Set output filename for vcd dump, or disable. Default is
|
||||||
|
verilator_gantt.vcd.
|
||||||
|
|
||||||
|
=back
|
||||||
|
|
||||||
|
=head1 DISTRIBUTION
|
||||||
|
|
||||||
|
The latest version is available from L<http://www.veripool.org/>.
|
||||||
|
|
||||||
|
Copyright 2018-2018 by Wilson Snyder. Verilator is free software; you can
|
||||||
|
redistribute it and/or modify it under the terms of either the GNU Lesser
|
||||||
|
General Public License Version 3 or the Perl Artistic License Version 2.0.
|
||||||
|
|
||||||
|
=head1 AUTHORS
|
||||||
|
|
||||||
|
Wilson Snyder <wsnyder@wsnyder.org>
|
||||||
|
|
||||||
|
=head1 SEE ALSO
|
||||||
|
|
||||||
|
C<verilator>
|
||||||
|
|
||||||
|
=cut
|
||||||
|
|
||||||
|
######################################################################
|
||||||
|
### Local Variables:
|
||||||
|
### compile-command: "$V4/bin/verilator_gantt $V4/test_regress/obj_vltmt/t_gantt/vlt_sim.log"
|
||||||
|
### End:
|
||||||
|
|
@ -38,6 +38,7 @@ VerilatedVoidCb Verilated::s_flushCb = NULL;
|
||||||
|
|
||||||
// Keep below together in one cache line
|
// Keep below together in one cache line
|
||||||
Verilated::Serialized Verilated::s_s;
|
Verilated::Serialized Verilated::s_s;
|
||||||
|
Verilated::NonSerialized Verilated::s_ns;
|
||||||
VL_THREAD_LOCAL Verilated::ThreadLocal Verilated::t_s;
|
VL_THREAD_LOCAL Verilated::ThreadLocal Verilated::t_s;
|
||||||
|
|
||||||
Verilated::CommandArgValues Verilated::s_args;
|
Verilated::CommandArgValues Verilated::s_args;
|
||||||
|
|
@ -196,6 +197,17 @@ Verilated::Serialized::Serialized() {
|
||||||
s_fatalOnVpiError = true; // retains old default behaviour
|
s_fatalOnVpiError = true; // retains old default behaviour
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Set the +prof+threads defaults: start time 1, window size 2, and a
// heap-allocated copy of the default dump filename.
Verilated::NonSerialized::NonSerialized()
    : s_profThreadsStart(1)
    , s_profThreadsWindow(2)
    , s_profThreadsFilenamep(strdup("profile_threads.dat")) {}
|
||||||
|
// Release the strdup'ed filename; free() accepts NULL, so no guard is needed.
Verilated::NonSerialized::~NonSerialized() {
    free(const_cast<char*>(s_profThreadsFilenamep));
    s_profThreadsFilenamep = NULL;
}
|
||||||
|
|
||||||
//===========================================================================
|
//===========================================================================
|
||||||
// Random reset -- Only called at init time, so don't inline.
|
// Random reset -- Only called at init time, so don't inline.
|
||||||
|
|
||||||
|
|
@ -1648,6 +1660,20 @@ void Verilated::fatalOnVpiError(bool flag) VL_MT_SAFE {
|
||||||
VerilatedLockGuard lock(m_mutex);
|
VerilatedLockGuard lock(m_mutex);
|
||||||
s_s.s_fatalOnVpiError = flag;
|
s_s.s_fatalOnVpiError = flag;
|
||||||
}
|
}
|
||||||
|
// Set the +prof+threads starting time, under the class mutex.
void Verilated::profThreadsStart(vluint64_t flag) VL_MT_SAFE {
    VerilatedLockGuard guard(m_mutex);
    s_ns.s_profThreadsStart = flag;
}
|
||||||
|
// Set the +prof+threads window size, under the class mutex.
// The stored field is 32 bits wide, so the 64-bit argument is narrowed.
void Verilated::profThreadsWindow(vluint64_t flag) VL_MT_SAFE {
    VerilatedLockGuard guard(m_mutex);
    s_ns.s_profThreadsWindow = static_cast<vluint32_t>(flag);
}
|
||||||
|
// Replace the +prof+threads dump filename with a private copy of flagp,
// freeing the previous copy; done under the class mutex.
void Verilated::profThreadsFilenamep(const char* flagp) VL_MT_SAFE {
    VerilatedLockGuard guard(m_mutex);
    // free(NULL) is a no-op, so the old pointer needs no check
    free(const_cast<char*>(s_ns.s_profThreadsFilenamep));
    s_ns.s_profThreadsFilenamep = strdup(flagp);
}
|
||||||
|
|
||||||
|
|
||||||
const char* Verilated::catName(const char* n1, const char* n2) VL_MT_SAFE {
|
const char* Verilated::catName(const char* n1, const char* n2) VL_MT_SAFE {
|
||||||
// Returns new'ed data
|
// Returns new'ed data
|
||||||
|
|
@ -1800,6 +1826,15 @@ void VerilatedImp::commandArgVl(const std::string& arg) {
|
||||||
VL_PRINTF_MT("For help, please see 'verilator --help'\n");
|
VL_PRINTF_MT("For help, please see 'verilator --help'\n");
|
||||||
VL_FATAL_MT("COMMAND_LINE", 0, "", "Exiting due to command line argument (not an error)");
|
VL_FATAL_MT("COMMAND_LINE", 0, "", "Exiting due to command line argument (not an error)");
|
||||||
}
|
}
|
||||||
|
else if (commandArgVlValue(arg, "+verilator+prof+threads+start+", value/*ref*/)) {
|
||||||
|
Verilated::profThreadsStart(atoll(value.c_str()));
|
||||||
|
}
|
||||||
|
else if (commandArgVlValue(arg, "+verilator+prof+threads+window+", value/*ref*/)) {
|
||||||
|
Verilated::profThreadsWindow(atol(value.c_str()));
|
||||||
|
}
|
||||||
|
else if (commandArgVlValue(arg, "+verilator+prof+threads+file+", value/*ref*/)) {
|
||||||
|
Verilated::profThreadsFilenamep(value.c_str());
|
||||||
|
}
|
||||||
else if (commandArgVlValue(arg, "+verilator+rand+reset+", value/*ref*/)) {
|
else if (commandArgVlValue(arg, "+verilator+rand+reset+", value/*ref*/)) {
|
||||||
Verilated::randReset(atoi(value.c_str()));
|
Verilated::randReset(atoi(value.c_str()));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -344,6 +344,17 @@ class Verilated {
|
||||||
~Serialized() {}
|
~Serialized() {}
|
||||||
} s_s;
|
} s_s;
|
||||||
|
|
||||||
|
static struct NonSerialized { // Non-serialized information
|
||||||
|
// These are reloaded from command-line settings, so do not need to persist
|
||||||
|
// Fast path
|
||||||
|
vluint64_t s_profThreadsStart; ///< +prof+threads starting time
|
||||||
|
vluint32_t s_profThreadsWindow; ///< +prof+threads window size
|
||||||
|
// Slow path
|
||||||
|
const char* s_profThreadsFilenamep; ///< +prof+threads filename
|
||||||
|
NonSerialized();
|
||||||
|
~NonSerialized();
|
||||||
|
} s_ns;
|
||||||
|
|
||||||
// no need to be save-restored (serialized) the
|
// no need to be save-restored (serialized) the
|
||||||
// assumption is that the restore is allowed to pass different arguments
|
// assumption is that the restore is allowed to pass different arguments
|
||||||
static struct CommandArgValues {
|
static struct CommandArgValues {
|
||||||
|
|
@ -409,6 +420,14 @@ public:
|
||||||
/// Enable/disable vpi fatal
|
/// Enable/disable vpi fatal
|
||||||
static void fatalOnVpiError(bool flag) VL_MT_SAFE;
|
static void fatalOnVpiError(bool flag) VL_MT_SAFE;
|
||||||
static bool fatalOnVpiError() VL_MT_SAFE { return s_s.s_fatalOnVpiError; }
|
static bool fatalOnVpiError() VL_MT_SAFE { return s_s.s_fatalOnVpiError; }
|
||||||
|
/// --prof-threads related settings
|
||||||
|
static void profThreadsStart(vluint64_t flag) VL_MT_SAFE;
|
||||||
|
static vluint64_t profThreadsStart() VL_MT_SAFE { return s_ns.s_profThreadsStart; }
|
||||||
|
static void profThreadsWindow(vluint64_t flag) VL_MT_SAFE;
|
||||||
|
static vluint32_t profThreadsWindow() VL_MT_SAFE { return s_ns.s_profThreadsWindow; }
|
||||||
|
static void profThreadsFilenamep(const char* flagp) VL_MT_SAFE;
|
||||||
|
static const char* profThreadsFilenamep() VL_MT_SAFE { return s_ns.s_profThreadsFilenamep; }
|
||||||
|
|
||||||
/// Flush callback for VCD waves
|
/// Flush callback for VCD waves
|
||||||
static void flushCb(VerilatedVoidCb cb) VL_MT_SAFE;
|
static void flushCb(VerilatedVoidCb cb) VL_MT_SAFE;
|
||||||
static void flushCall() VL_MT_SAFE;
|
static void flushCall() VL_MT_SAFE;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,229 @@
|
||||||
|
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
||||||
|
//=============================================================================
|
||||||
|
//
|
||||||
|
// THIS MODULE IS PUBLICLY LICENSED
|
||||||
|
//
|
||||||
|
// Copyright 2012-2018 by Wilson Snyder. This program is free software;
|
||||||
|
// you can redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
// Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
|
||||||
|
//
|
||||||
|
// This is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
// for more details.
|
||||||
|
//
|
||||||
|
//=============================================================================
|
||||||
|
///
|
||||||
|
/// \file
|
||||||
|
/// \brief Thread pool for verilated modules
|
||||||
|
///
|
||||||
|
//=============================================================================
|
||||||
|
|
||||||
|
#include "verilatedos.h"
|
||||||
|
#include "verilated_threads.h"
|
||||||
|
#include <cstdio>
|
||||||
|
|
||||||
|
std::atomic<vluint64_t> VlNotification::s_yields;
|
||||||
|
|
||||||
|
VL_THREAD_LOCAL VlThreadPool::ProfileTrace* VlThreadPool::t_profilep = NULL;
|
||||||
|
|
||||||
|
//=============================================================================
|
||||||
|
// VlMTaskVertex
|
||||||
|
|
||||||
|
// Start with no upstream dependencies done and remember how many there are.
VlMTaskVertex::VlMTaskVertex(vluint32_t upstreamDepCount)
    : m_upstreamDepsDone(0)
    , m_upstreamDepCount(upstreamDepCount) {
    // The done-counter is expected to be a lock-free atomic
    assert(m_upstreamDepsDone.is_lock_free());
}
|
||||||
|
|
||||||
|
//=============================================================================
|
||||||
|
// VlWorkerThread
|
||||||
|
|
||||||
|
// Record the owning pool and profiling flag, then launch the worker thread
// running startWorker(this).
VlWorkerThread::VlWorkerThread(VlThreadPool* poolp, bool profiling)
    : m_poolp(poolp),
      m_profiling(profiling),
      m_exiting(false),
      // Must init this last -- after setting up fields that it might read:
      m_cthread(startWorker, this) {
}
|
||||||
|
|
||||||
|
// Flag the worker as exiting, wake it if it is asleep, and wait for its
// thread to finish.
VlWorkerThread::~VlWorkerThread() {
    m_exiting.store(true, std::memory_order_release);
    {
        VerilatedLockGuard guard(m_mutex);
        if (sleeping()) wakeUp();
    }
    // The thread should exit; join it.
    m_cthread.join();
}
|
||||||
|
|
||||||
|
void VlWorkerThread::workerLoop() {
|
||||||
|
if (VL_UNLIKELY(m_profiling)) {
|
||||||
|
m_poolp->setupProfilingClientThread();
|
||||||
|
}
|
||||||
|
|
||||||
|
VlNotification alarm;
|
||||||
|
ExecRec work;
|
||||||
|
work.m_fnp = NULL;
|
||||||
|
|
||||||
|
while (1) {
|
||||||
|
bool sleep = false;
|
||||||
|
if (VL_UNLIKELY(!work.m_fnp)) {
|
||||||
|
// Look for work
|
||||||
|
VerilatedLockGuard lk(m_mutex);
|
||||||
|
if (VL_LIKELY(!m_ready.empty())) {
|
||||||
|
dequeWork(&work);
|
||||||
|
} else {
|
||||||
|
// No work available, prepare to sleep. Pass alarm/work
|
||||||
|
// into m_sleepAlarm so wakeUp will tall this function.
|
||||||
|
//
|
||||||
|
// Must modify m_sleepAlarm in the same critical section as
|
||||||
|
// the check for ready work, otherwise we could race with
|
||||||
|
// another thread enqueueing work and never be awoken.
|
||||||
|
m_sleepAlarm.first = &alarm;
|
||||||
|
m_sleepAlarm.second = &work;
|
||||||
|
sleep = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Do this here, not above, to avoid a race with the destructor.
|
||||||
|
if (VL_UNLIKELY(m_exiting.load(std::memory_order_acquire)))
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (VL_UNLIKELY(sleep)) {
|
||||||
|
alarm.waitForNotification(); // ZZZzzzzz
|
||||||
|
alarm.reset();
|
||||||
|
}
|
||||||
|
if (VL_LIKELY(work.m_fnp)) {
|
||||||
|
work.m_fnp(work.m_evenCycle, work.m_sym);
|
||||||
|
work.m_fnp = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (VL_UNLIKELY(m_profiling)) {
|
||||||
|
m_poolp->tearDownProfilingClientThread();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Thread entry point handed to the thread constructor; runs the given
// worker's loop.
void VlWorkerThread::startWorker(VlWorkerThread* workerp) {
    VlWorkerThread& worker = *workerp;
    worker.workerLoop();
}
|
||||||
|
|
||||||
|
//=============================================================================
|
||||||
|
// VlThreadPool
|
||||||
|
|
||||||
|
// Create nThreads worker threads and, when profiling, a profile buffer for
// the constructing thread as well.  Warns if the host reports fewer CPUs
// than the model's thread count.
VlThreadPool::VlThreadPool(int nThreads, bool profiling)
    : m_profiling(profiling) {
    // --threads N passes nThreads=N-1, as the "main" threads counts as 1
    const unsigned cpus = std::thread::hardware_concurrency();
    // hardware_concurrency() returns 0 when the count cannot be determined;
    // that is "unknown", not "too few", so stay quiet in that case.
    if (cpus != 0 && cpus < static_cast<unsigned>(nThreads+1)) {
        VL_PRINTF_MT("%%Warning: System has %u CPUs but model Verilated with"
                     " --threads %d; may run slow.\n", cpus, nThreads+1);
    }
    // Create'em
    for (int i=0; i<nThreads; ++i) {
        m_workers.push_back(new VlWorkerThread(this, profiling));
    }
    // Set up a profile buffer for the current thread too -- on the
    // assumption that it's the same thread that calls eval and may be
    // donated to run mtasks during the eval.
    if (VL_UNLIKELY(m_profiling)) {
        setupProfilingClientThread();
    }
}
|
||||||
|
|
||||||
|
// Destroy all workers, then release the calling thread's profile buffer
// when profiling.
VlThreadPool::~VlThreadPool() {
    const size_t workerCount = m_workers.size();
    for (size_t idx = 0; idx < workerCount; ++idx) {
        // Each ~WorkerThread will wait for its thread to exit.
        delete m_workers[idx];
    }
    if (VL_UNLIKELY(m_profiling)) tearDownProfilingClientThread();
}
|
||||||
|
|
||||||
|
void VlThreadPool::tearDownProfilingClientThread() {
|
||||||
|
assert(t_profilep);
|
||||||
|
delete t_profilep;
|
||||||
|
t_profilep = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void VlThreadPool::setupProfilingClientThread() {
|
||||||
|
assert(!t_profilep);
|
||||||
|
t_profilep = new ProfileTrace;
|
||||||
|
// Reserve some space in the thread-local profiling buffer;
|
||||||
|
// try not to malloc while collecting profiling.
|
||||||
|
t_profilep->reserve(4096);
|
||||||
|
{
|
||||||
|
VerilatedLockGuard lk(m_mutex);
|
||||||
|
m_allProfiles.insert(t_profilep);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void VlThreadPool::profileAppendAll(const VlProfileRec& rec) {
|
||||||
|
VerilatedLockGuard lk(m_mutex);
|
||||||
|
for (ProfileSet::iterator it = m_allProfiles.begin();
|
||||||
|
it != m_allProfiles.end(); ++it) {
|
||||||
|
// Every thread's profile trace gets a copy of rec.
|
||||||
|
(*it)->emplace_back(rec);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void VlThreadPool::profileDump(const char* filenamep, vluint64_t ticksElapsed) {
|
||||||
|
VerilatedLockGuard lk(m_mutex);
|
||||||
|
VL_DEBUG_IF(VL_DBG_MSGF("+prof+threads writing to '%s'\n", filenamep););
|
||||||
|
|
||||||
|
FILE* fp = fopen(filenamep, "w");
|
||||||
|
if (VL_UNLIKELY(!fp)) {
|
||||||
|
VL_FATAL_MT(filenamep, 0, "", "+prof+threads+file file not writable");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO Perhaps merge with verilated_coverage output format, so can
|
||||||
|
// have a common merging and reporting tool, etc.
|
||||||
|
fprintf(fp, "VLPROFTHREAD 1.0 # Verilator thread profile dump version 1.0\n");
|
||||||
|
fprintf(fp, "VLPROF arg --threads %" VL_PRI64 "u\n",
|
||||||
|
vluint64_t(m_workers.size()+1));
|
||||||
|
fprintf(fp, "VLPROF arg +verilator+prof+threads+start+%" VL_PRI64 "u\n",
|
||||||
|
Verilated::profThreadsStart());
|
||||||
|
fprintf(fp, "VLPROF arg +verilator+prof+threads+window+%u\n",
|
||||||
|
Verilated::profThreadsWindow());
|
||||||
|
fprintf(fp, "VLPROF stat yields %" VL_PRI64 "u\n",
|
||||||
|
VlNotification::yields());
|
||||||
|
|
||||||
|
vluint32_t thread_id = 0;
|
||||||
|
for (ProfileSet::iterator pit = m_allProfiles.begin();
|
||||||
|
pit != m_allProfiles.end(); ++pit) {
|
||||||
|
++thread_id;
|
||||||
|
|
||||||
|
bool printing = false; // False while in warmup phase
|
||||||
|
for (ProfileTrace::iterator eit = (*pit)->begin();
|
||||||
|
eit != (*pit)->end(); ++eit) {
|
||||||
|
switch (eit->m_type) {
|
||||||
|
case VlProfileRec::TYPE_BARRIER:
|
||||||
|
printing = true;
|
||||||
|
break;
|
||||||
|
case VlProfileRec::TYPE_MTASK_RUN:
|
||||||
|
if (!printing) break;
|
||||||
|
fprintf(fp, "VLPROF mtask %d"
|
||||||
|
" start %" VL_PRI64"u end %" VL_PRI64"u elapsed %" VL_PRI64 "u"
|
||||||
|
" predict_time %u cpu %u on thread %u\n",
|
||||||
|
eit->m_mtaskId,
|
||||||
|
eit->m_startTime,
|
||||||
|
eit->m_endTime,
|
||||||
|
(eit->m_endTime - eit->m_startTime),
|
||||||
|
eit->m_predictTime,
|
||||||
|
eit->m_cpu,
|
||||||
|
thread_id);
|
||||||
|
break;
|
||||||
|
default: assert(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fprintf(fp, "VLPROF stat ticks %" VL_PRI64 "u\n",
|
||||||
|
ticksElapsed);
|
||||||
|
|
||||||
|
fclose(fp);
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,313 @@
|
||||||
|
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
||||||
|
//=============================================================================
|
||||||
|
//
|
||||||
|
// THIS MODULE IS PUBLICLY LICENSED
|
||||||
|
//
|
||||||
|
// Copyright 2012-2018 by Wilson Snyder. This program is free software;
|
||||||
|
// you can redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
// Lesser General Public License Version 3 or the Perl Artistic License Version 2.0.
|
||||||
|
//
|
||||||
|
// This is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
// for more details.
|
||||||
|
//
|
||||||
|
//=============================================================================
|
||||||
|
///
|
||||||
|
/// \file
|
||||||
|
/// \brief Thread pool and profiling for Verilated modules
|
||||||
|
///
|
||||||
|
//=============================================================================
|
||||||
|
|
||||||
|
#ifndef _VERILATED_THREADS_H_
|
||||||
|
#define _VERILATED_THREADS_H_
|
||||||
|
|
||||||
|
#include "verilatedos.h"
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
#include <set>
|
||||||
|
#include <sched.h> // For sched_getcpu()
|
||||||
|
|
||||||
|
#include "verilated.h" // for VerilatedMutex and clang annotations
|
||||||
|
|
||||||
|
// VlMTaskVertex and VlThreadpool will work with multiple symbol table types.
|
||||||
|
// Since the type is opaque to VlMTaskVertex and VlThreadPool, represent it
|
||||||
|
// as a void* here.
|
||||||
|
typedef void* VlThrSymTab;
|
||||||
|
|
||||||
|
class VlNotification {
|
||||||
|
// MEMBERS
|
||||||
|
std::atomic<bool> m_notified; // Notification pending
|
||||||
|
static std::atomic<vluint64_t> s_yields; // Statistics
|
||||||
|
|
||||||
|
public:
|
||||||
|
// CONSTRUCTORS
|
||||||
|
VlNotification()
|
||||||
|
: m_notified(false) {
|
||||||
|
assert(atomic_is_lock_free(&m_notified));
|
||||||
|
}
|
||||||
|
~VlNotification() {}
|
||||||
|
|
||||||
|
// METHODS
|
||||||
|
static vluint64_t yields() { return s_yields; }
|
||||||
|
|
||||||
|
// Block until notify() has occurred, then return.
|
||||||
|
// If notify() has already occurred, return immediately.
|
||||||
|
//
|
||||||
|
// This is logically const: the object will remain in notified state
|
||||||
|
// after WaitForNotification() returns, so you could notify more than
|
||||||
|
// one thread of the same event.
|
||||||
|
inline void waitForNotification() {
|
||||||
|
unsigned ct = 0;
|
||||||
|
while (VL_UNLIKELY(!notified())) {
|
||||||
|
VL_CPU_RELAX();
|
||||||
|
ct++;
|
||||||
|
if (VL_UNLIKELY(ct > VL_LOCK_SPINS)) {
|
||||||
|
ct = 0;
|
||||||
|
++s_yields; // Statistics
|
||||||
|
std::this_thread::yield();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// The 'inline' keyword here means nothing to the compiler, it's
|
||||||
|
// implicit on methods defined within the class body anyway.
|
||||||
|
//
|
||||||
|
// 'inline' is attached the this method, and others in this file,
|
||||||
|
// to remind humans that some routines in this file are called many
|
||||||
|
// times per cycle in threaded mode. Such routines should be
|
||||||
|
// inlinable; that's why they're declared in the .h and not the .cpp.
|
||||||
|
inline bool notified() {
|
||||||
|
return m_notified.load(std::memory_order_acquire);
|
||||||
|
}
|
||||||
|
// Set notified state. If state is already notified,
|
||||||
|
// it remains so.
|
||||||
|
inline void notify() {
|
||||||
|
m_notified.store(true, std::memory_order_release);
|
||||||
|
}
|
||||||
|
// Reset the state to un-notified state, which is also the
|
||||||
|
// state of a new Notification object.
|
||||||
|
inline void reset() {
|
||||||
|
m_notified.store(false, std::memory_order_relaxed);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef void (*VlExecFnp)(bool, VlThrSymTab);
|
||||||
|
|
||||||
|
/// Track dependencies for a single MTask.
|
||||||
|
class VlMTaskVertex {
|
||||||
|
// MEMBERS
|
||||||
|
|
||||||
|
// On even cycles, _upstreamDepsDone increases as upstream
|
||||||
|
// dependencies complete. When it reaches _upstreamDepCount,
|
||||||
|
// this MTaskVertex is ready.
|
||||||
|
//
|
||||||
|
// On odd cycles, _upstreamDepsDone decreases as upstream
|
||||||
|
// dependencies complete, and when it reaches zero this MTaskVertex
|
||||||
|
// is ready.
|
||||||
|
//
|
||||||
|
// An atomic is smaller than a mutex, and lock-free.
|
||||||
|
//
|
||||||
|
// (Why does the size of this class matter? If an mtask has many
|
||||||
|
// downstream mtasks to notify, we hope these will pack into a
|
||||||
|
// small number of cache lines to reduce the cost of pointer chasing
|
||||||
|
// during done-notification. Nobody's quantified that cost though.
|
||||||
|
// If we were really serious about shrinking this class, we could
|
||||||
|
// use 16-bit types here...)
|
||||||
|
std::atomic<vluint32_t> m_upstreamDepsDone;
|
||||||
|
const vluint32_t m_upstreamDepCount;
|
||||||
|
|
||||||
|
public:
|
||||||
|
// CONSTRUCTORS
|
||||||
|
|
||||||
|
// 'upstreamDepCount' is the number of upstream MTaskVertex's
|
||||||
|
// that must notify this MTaskVertex before it will become ready
|
||||||
|
// to run.
|
||||||
|
explicit VlMTaskVertex(vluint32_t upstreamDepCount);
|
||||||
|
~VlMTaskVertex() {}
|
||||||
|
|
||||||
|
// Upstream mtasks must call this when they complete.
|
||||||
|
// Returns true when the current MTaskVertex becomes ready to execute,
|
||||||
|
// false while it's still waiting on more dependencies.
|
||||||
|
inline bool signalUpstreamDone(bool evenCycle) {
|
||||||
|
if (evenCycle) {
|
||||||
|
vluint32_t upstreamDepsDone
|
||||||
|
= 1 + m_upstreamDepsDone.fetch_add(1, std::memory_order_release);
|
||||||
|
assert(upstreamDepsDone <= m_upstreamDepCount);
|
||||||
|
return (upstreamDepsDone == m_upstreamDepCount);
|
||||||
|
} else {
|
||||||
|
vluint32_t upstreamDepsDone_prev
|
||||||
|
= m_upstreamDepsDone.fetch_sub(1, std::memory_order_release);
|
||||||
|
assert(upstreamDepsDone_prev > 0);
|
||||||
|
return (upstreamDepsDone_prev == 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
inline bool areUpstreamDepsDone(bool evenCycle) const {
|
||||||
|
vluint32_t target = evenCycle ? m_upstreamDepCount : 0;
|
||||||
|
return m_upstreamDepsDone.load(std::memory_order_acquire) == target;
|
||||||
|
}
|
||||||
|
inline void waitUntilUpstreamDone(bool evenCycle) const {
|
||||||
|
while (VL_UNLIKELY(!areUpstreamDepsDone(evenCycle))) {
|
||||||
|
VL_CPU_RELAX();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Profiling support
|
||||||
|
class VlProfileRec {
|
||||||
|
protected:
|
||||||
|
friend class VlThreadPool;
|
||||||
|
enum VlProfileE {
|
||||||
|
TYPE_MTASK_RUN,
|
||||||
|
TYPE_BARRIER
|
||||||
|
};
|
||||||
|
VlProfileE m_type; // Record type
|
||||||
|
vluint32_t m_mtaskId; // Mtask we're logging
|
||||||
|
vluint32_t m_predictTime; // How long scheduler predicted would take
|
||||||
|
vluint64_t m_startTime; // Tick at start of execution
|
||||||
|
vluint64_t m_endTime; // Tick at end of execution
|
||||||
|
unsigned m_cpu; // Execution CPU number (at start anyways)
|
||||||
|
public:
|
||||||
|
class Barrier {};
|
||||||
|
VlProfileRec() {}
|
||||||
|
explicit VlProfileRec(Barrier) {
|
||||||
|
m_type = TYPE_BARRIER;
|
||||||
|
m_mtaskId = 0;
|
||||||
|
m_predictTime = 0;
|
||||||
|
m_startTime = 0;
|
||||||
|
m_cpu = sched_getcpu();
|
||||||
|
}
|
||||||
|
void startRecord(vluint64_t time, uint32_t mtask, uint32_t predict) {
|
||||||
|
m_type = VlProfileRec::TYPE_MTASK_RUN;
|
||||||
|
m_mtaskId = mtask;
|
||||||
|
m_predictTime = predict;
|
||||||
|
m_startTime = time;
|
||||||
|
m_cpu = sched_getcpu();
|
||||||
|
}
|
||||||
|
void endRecord(vluint64_t time) {
|
||||||
|
m_endTime = time;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class VlThreadPool;
|
||||||
|
|
||||||
|
class VlWorkerThread {
|
||||||
|
private:
|
||||||
|
// TYPES
|
||||||
|
struct ExecRec {
|
||||||
|
VlExecFnp m_fnp; // Function to execute
|
||||||
|
VlThrSymTab m_sym; // Symbol table to execute
|
||||||
|
bool m_evenCycle; // Even/odd for flag alternation
|
||||||
|
ExecRec() : m_fnp(NULL), m_sym(NULL), m_evenCycle(false) {}
|
||||||
|
ExecRec(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym)
|
||||||
|
: m_fnp(fnp), m_sym(sym), m_evenCycle(evenCycle) {}
|
||||||
|
};
|
||||||
|
|
||||||
|
// MEMBERS
|
||||||
|
VerilatedMutex m_mutex;
|
||||||
|
|
||||||
|
// Why a vector? We expect the pending list to be very short, typically
|
||||||
|
// 0 or 1 or 2, so popping from the front shouldn't be
|
||||||
|
// expensive. Revisit if we ever have longer queues...
|
||||||
|
std::vector<ExecRec> m_ready VL_GUARDED_BY(m_mutex);
|
||||||
|
|
||||||
|
VlThreadPool* m_poolp; // Our associated thread pool
|
||||||
|
|
||||||
|
// If values stored are non-NULL, the thread is asleep pending new
|
||||||
|
// work. If the thread is not asleep, both parts of m_sleepAlarm must
|
||||||
|
// be NULL.
|
||||||
|
std::pair<VlNotification*, ExecRec*> m_sleepAlarm VL_GUARDED_BY(m_mutex);
|
||||||
|
|
||||||
|
bool m_profiling; // Is profiling enabled?
|
||||||
|
std::atomic<bool> m_exiting; // Worker thread should exit
|
||||||
|
std::thread m_cthread; // Underlying C++ thread record
|
||||||
|
|
||||||
|
VL_UNCOPYABLE(VlWorkerThread);
|
||||||
|
|
||||||
|
public:
|
||||||
|
// CONSTRUCTORS
|
||||||
|
explicit VlWorkerThread(VlThreadPool* poolp, bool profiling);
|
||||||
|
~VlWorkerThread();
|
||||||
|
|
||||||
|
// METHODS
|
||||||
|
inline void dequeWork(ExecRec* workp) VL_REQUIRES(m_mutex) {
|
||||||
|
// As noted above this is inefficient if our ready list is ever
|
||||||
|
// long (but it shouldn't be)
|
||||||
|
*workp = m_ready.front();
|
||||||
|
m_ready.erase(m_ready.begin());
|
||||||
|
}
|
||||||
|
inline void wakeUp() VL_REQUIRES(m_mutex) {
|
||||||
|
VlNotification* notifyp = m_sleepAlarm.first;
|
||||||
|
m_sleepAlarm.first = NULL; // NULL+NULL means wake
|
||||||
|
m_sleepAlarm.second = NULL;
|
||||||
|
notifyp->notify();
|
||||||
|
}
|
||||||
|
inline bool sleeping() VL_REQUIRES(m_mutex) {
|
||||||
|
return (m_sleepAlarm.first != NULL);
|
||||||
|
}
|
||||||
|
inline void addTask(VlExecFnp fnp, bool evenCycle, VlThrSymTab sym) {
|
||||||
|
VerilatedLockGuard lk(m_mutex);
|
||||||
|
m_ready.emplace_back(fnp, evenCycle, sym);
|
||||||
|
if (VL_LIKELY(sleeping())) { // Generally queue is waiting for work
|
||||||
|
// Awaken thread
|
||||||
|
dequeWork(m_sleepAlarm.second);
|
||||||
|
wakeUp();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void workerLoop();
|
||||||
|
static void startWorker(VlWorkerThread* workerp);
|
||||||
|
};
|
||||||
|
|
||||||
|
class VlThreadPool {
|
||||||
|
// TYPES
|
||||||
|
typedef std::vector<VlProfileRec> ProfileTrace;
|
||||||
|
typedef std::set<ProfileTrace*> ProfileSet;
|
||||||
|
|
||||||
|
// MEMBERS
|
||||||
|
std::vector<VlWorkerThread*> m_workers; // our workers
|
||||||
|
bool m_profiling; // is profiling enabled?
|
||||||
|
|
||||||
|
// Support profiling -- we can append records of profiling events
|
||||||
|
// to this vector with very low overhead, and then dump them out
|
||||||
|
// later. This prevents the overhead of printf/malloc/IO from
|
||||||
|
// corrupting the profiling data. It's super cheap to append
|
||||||
|
// a VlProfileRec struct on the end of a pre-allocated vector;
|
||||||
|
// this is the only cost we pay in real-time during a profiling cycle.
|
||||||
|
static VL_THREAD_LOCAL ProfileTrace* t_profilep;
|
||||||
|
ProfileSet m_allProfiles VL_GUARDED_BY(m_mutex);
|
||||||
|
VerilatedMutex m_mutex;
|
||||||
|
|
||||||
|
public:
|
||||||
|
// CONSTRUCTORS
|
||||||
|
// Construct a thread pool with 'nThreads' dedicated threads. The thread
|
||||||
|
// pool will create these threads and make them available to execute tasks
|
||||||
|
// via this->workerp(index)->addTask(...)
|
||||||
|
VlThreadPool(int nThreads, bool profiling);
|
||||||
|
~VlThreadPool();
|
||||||
|
|
||||||
|
// METHODS
|
||||||
|
inline int numThreads() const {
|
||||||
|
return m_workers.size();
|
||||||
|
}
|
||||||
|
inline VlWorkerThread* workerp(int index) {
|
||||||
|
assert(index >= 0);
|
||||||
|
assert(index < m_workers.size());
|
||||||
|
return m_workers[index];
|
||||||
|
}
|
||||||
|
inline VlProfileRec* profileAppend() {
|
||||||
|
t_profilep->emplace_back();
|
||||||
|
return &(t_profilep->back());
|
||||||
|
}
|
||||||
|
void profileAppendAll(const VlProfileRec& rec);
|
||||||
|
void profileDump(const char* filenamep, vluint64_t ticksElapsed);
|
||||||
|
// In profiling mode, each executing thread must call
|
||||||
|
// this once to setup profiling state:
|
||||||
|
void setupProfilingClientThread();
|
||||||
|
void tearDownProfilingClientThread();
|
||||||
|
private:
|
||||||
|
VL_UNCOPYABLE(VlThreadPool);
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
215
internals.pod
215
internals.pod
|
|
@ -155,6 +155,221 @@ provided and documented in C<V3GraphAlg.cpp>.
|
||||||
|
|
||||||
=back
|
=back
|
||||||
|
|
||||||
|
=head2 Multithreaded Mode
|
||||||
|
|
||||||
|
In --threads mode, the frontend of the Verilator pipeline is the same as
|
||||||
|
serial mode, up until V3Order.
|
||||||
|
|
||||||
|
V3Order builds a fine-grained, statement-level dependency graph that governs
|
||||||
|
the ordering of code within a single eval() call. In serial mode, that
|
||||||
|
dependency graph is used to order all statements into a total serial order.
|
||||||
|
In parallel mode, the same dependency graph is the starting point for a
|
||||||
|
partitioner (V3Partition).
|
||||||
|
|
||||||
|
The partitioner's goal is to coarsen the fine-grained DAG into a coarser
|
||||||
|
DAG, while maintaining as much available parallelism as possible. Often the
|
||||||
|
partitioner can transform an input graph with millions of nodes into a
|
||||||
|
coarsened execution graph with a few dozen nodes, while maintaining enough
|
||||||
|
parallelism to take advantage of a modern multicore CPU. Runtime
|
||||||
|
synchronization cost is not prohibitive with so few nodes.
|
||||||
|
|
||||||
|
=head3 Partitioning
|
||||||
|
|
||||||
|
Our partitioner is similar to the one Vivek Sarkar described in his 1989
|
||||||
|
paper "Partitioning and Scheduling Parallel Programs for Multiprocessors".
|
||||||
|
|
||||||
|
Let's define some terms:
|
||||||
|
|
||||||
|
=over 4
|
||||||
|
|
||||||
|
=item C<Par Factor>
|
||||||
|
|
||||||
|
The available parallelism or "par-factor" of a DAG is the total cost to
|
||||||
|
execute all nodes, divided by the cost to execute the longest critical path
|
||||||
|
through the graph. This is the speedup you would get from running the graph
|
||||||
|
in parallel, given infinitely many CPU cores and assuming the costs of
|
||||||
|
communication and synchronization are zero.
|
||||||
|
|
||||||
|
=item C<Macro Task>
|
||||||
|
|
||||||
|
When the partitioner coarsens the graph, it combines nodes together. Each
|
||||||
|
fine-grained node represents an atomic "task"; combined nodes in the
|
||||||
|
coarsened graph are "macro-tasks". This term comes from Sarkar. Each
|
||||||
|
macro-task executes from start to end on one processor, without any
|
||||||
|
synchronization to any other macro-task during its
|
||||||
|
execution. (Synchronization only happens before the macro-task begins or
|
||||||
|
after it ends.)
|
||||||
|
|
||||||
|
=item C<Edge Contraction>
|
||||||
|
|
||||||
|
Our partitioner, like Sarkar's, primarily relies on "edge contraction" to
|
||||||
|
coarsen the graph. It starts with one macro-task per atomic task and
|
||||||
|
iteratively combines pairs of edge-connected macro-tasks.
|
||||||
|
|
||||||
|
=item C<Local Critical Path>
|
||||||
|
|
||||||
|
Each node in the graph has a "local" critical path. That's the critical
|
||||||
|
path from the start of the graph to the start of the node, plus the node's
|
||||||
|
cost, plus the critical path from the end of the node to the end of the
|
||||||
|
graph.
|
||||||
|
|
||||||
|
=back
|
||||||
|
|
||||||
|
Sarkar calls out an important trade-off: coarsening the graph reduces
|
||||||
|
runtime synchronization overhead among the macro-tasks, but it tends to
|
||||||
|
increase the critical path through the graph and thus reduces par-factor.
|
||||||
|
|
||||||
|
Sarkar's partitioner, and ours, chooses pairs of macro-tasks to merge such
|
||||||
|
that the growth in critical path is minimized. Each candidate merge would
|
||||||
|
result in a new node, which would have some local critical path. We choose
|
||||||
|
the candidate that would produce the shortest local critical path. Repeat
|
||||||
|
until par-factor falls to a target threshold. It's a greedy algorithm, and
|
||||||
|
it's not guaranteed to produce the best partition (which Sarkar proves is
|
||||||
|
NP-hard).
|
||||||
|
|
||||||
|
=head3 Estimating Logic Costs
|
||||||
|
|
||||||
|
To compute the cost of any given path through the graph, Verilator
|
||||||
|
estimates an execution cost for each task. Each macro-task has an execution
|
||||||
|
cost which is simply the sum of its tasks' costs. We assume that
|
||||||
|
communication overhead and synchronization overhead are zero, so the cost
|
||||||
|
of any given path through the graph is simply the sum of macro-task
|
||||||
|
execution costs. Sarkar does almost the same thing, except that he has
|
||||||
|
nonzero estimates for synchronization costs.
|
||||||
|
|
||||||
|
Verilator's cost estimates are assigned by the InstrCountCostVisitor. This
|
||||||
|
class is perhaps the most fragile piece of the multithread implementation.
|
||||||
|
It's easy to have a bug where you count something cheap (eg. accessing one
|
||||||
|
element of a huge array) as if it were expensive (eg. by counting it as if
|
||||||
|
it were an access to the entire array.) Even without such gross bugs, the
|
||||||
|
estimates this class produces are only loosely predictive of actual runtime cost.
|
||||||
|
Multithread performance would be better with better runtime cost
|
||||||
|
estimates. This is an area to improve.
|
||||||
|
|
||||||
|
=head3 Scheduling Macro-Tasks at Runtime
|
||||||
|
|
||||||
|
After coarsening the graph, we must schedule the macro-tasks for runtime.
|
||||||
|
Sarkar describes two options. The first is to dynamically schedule tasks at
|
||||||
|
runtime, with a runtime graph follower. Sarkar calls this the
|
||||||
|
"macro-dataflow model." Verilator does not support this; early experiments
|
||||||
|
with this approach had poor performance.
|
||||||
|
|
||||||
|
The other option is to statically assign macro-tasks to threads, with each
|
||||||
|
thread running its macro-tasks in a static order. Sarkar describes this in
|
||||||
|
Chapter 5. Verilator takes this static approach. The only dynamic aspect is
|
||||||
|
that each macro task may block before starting, to wait until its
|
||||||
|
prerequisites on other threads have finished.
|
||||||
|
|
||||||
|
The synchronization cost is cheap if the prereqs are done. If they're not,
|
||||||
|
fragmentation (idle CPU cores waiting) is possible. This is the major
|
||||||
|
source of overhead in this approach. The --prof-threads switch and the
|
||||||
|
C<verilator_gantt> script can visualize the time lost to such
|
||||||
|
fragmentation.
|
||||||
|
|
||||||
|
=head3 Locating Variables for Best Spatial Locality
|
||||||
|
|
||||||
|
After scheduling all code, we attempt to locate variables in memory such
|
||||||
|
that variables accessed by a single macro-task are close together in
|
||||||
|
memory. This provides "spatial locality" -- when we pull in a 64-byte
|
||||||
|
cache line to access a 2-byte variable, we want the other 62 bytes to be
|
||||||
|
ones we'll also likely access soon, for best cache performance.
|
||||||
|
|
||||||
|
This turns out to be critical for performance. It should allow Verilator
|
||||||
|
to scale to very large models. We don't rely on our working set fitting
|
||||||
|
in any CPU cache; instead we essentially "stream" data into caches from
|
||||||
|
memory. It's not literally streaming, where the address increases
|
||||||
|
monotonically, but it should have similar performance characteristics,
|
||||||
|
so long as each macro-task's dataset fits in one core's local caches.
|
||||||
|
|
||||||
|
To achieve spatial locality, we tag each variable with the set of
|
||||||
|
macro-tasks that access it. Let's call this set the "footprint" of that
|
||||||
|
variable. The variables in a given module have a set of footprints. We can
|
||||||
|
order those footprints to minimize the distance between them (distance is
|
||||||
|
the number of macro-tasks that are different across any two footprints) and
|
||||||
|
then emit all variables into the struct in ordered-footprint order.
|
||||||
|
|
||||||
|
The footprint ordering is literally the traveling salesman problem, and we
|
||||||
|
use a TSP-approximation algorithm to get close to an optimal sort.
|
||||||
|
|
||||||
|
This is an old idea. Simulators designed at DEC in the early 1990s used
|
||||||
|
similar techniques to optimize both single-thread and multi-thread modes.
|
||||||
|
(Verilator does not optimize variable placement for spatial locality in
|
||||||
|
serial mode; that is a possible area for improvement.)
|
||||||
|
|
||||||
|
=head3 Improving Multithreaded Performance Further (a TODO list)
|
||||||
|
|
||||||
|
=over 4
|
||||||
|
|
||||||
|
=item C<Wave Scheduling>
|
||||||
|
|
||||||
|
To allow the verilated model to run in parallel with the testbench, it
|
||||||
|
might be nice to support "wave" scheduling, in which work on a cycle begins
|
||||||
|
before eval() is called or continues after eval() returns. For now all
|
||||||
|
work on a cycle happens during the eval() call, leaving Verilator's threads
|
||||||
|
idle while the testbench (everything outside eval()) is working. This would
|
||||||
|
involve fundamental changes within the partitioner; however, it's probably
|
||||||
|
the best bet for hiding testbench latency.
|
||||||
|
|
||||||
|
=item C<Efficient Dynamic Scheduling>
|
||||||
|
|
||||||
|
To scale to more than a few threads, we may revisit a fully dynamic
|
||||||
|
scheduler. For large (>16 core) systems it might make sense to dedicate an
|
||||||
|
entire core to scheduling, so that scheduler data structures would fit in
|
||||||
|
its L1 cache and thus the cost of traversing priority-ordered ready lists
|
||||||
|
would not be prohibitive.
|
||||||
|
|
||||||
|
=item C<Static Scheduling with Runtime Repack>
|
||||||
|
|
||||||
|
We could modify the static scheduling approach by gathering actual
|
||||||
|
macro-task execution times at run time, and dynamically re-packing the
|
||||||
|
macro-tasks into the threads also at run time. Say, re-pack once every
|
||||||
|
10,000 cycles or something. This has the potential to do better than our
|
||||||
|
static estimates about macro-task run times. It could potentially react to
|
||||||
|
CPU cores that aren't performing equally, due to NUMA or thermal throttling
|
||||||
|
or nonuniform competing memory traffic or whatever.
|
||||||
|
|
||||||
|
=item C<Clock Domain Balancing>
|
||||||
|
|
||||||
|
Right now Verilator makes no attempt to balance clock domains across
|
||||||
|
macro-tasks. For a multi-domain model, that could lead to bad gantt chart
|
||||||
|
fragmentation. This could be improved if it's a real problem in practice.
|
||||||
|
|
||||||
|
=item C<Other Forms of MTask Balancing>
|
||||||
|
|
||||||
|
The largest source of runtime overhead is idle CPUs, which happens due to
|
||||||
|
variance between our predicted runtime for each MTask and its actual
|
||||||
|
runtime. That variance is magnified if MTasks are homogeneous, containing
|
||||||
|
similar repeating logic which was generally close together in source code
|
||||||
|
and which is still packed together even after going through Verilator's
|
||||||
|
digestive tract.
|
||||||
|
|
||||||
|
If Verilator could avoid doing that, and instead would take source logic
|
||||||
|
that was close together and distribute it across MTasks, that would
|
||||||
|
increase the diversity of any given MTask, and this should reduce variance
|
||||||
|
in the cost estimates.
|
||||||
|
|
||||||
|
One way to do that might be to make various "tie breaker" comparison
|
||||||
|
routines in the sources to rely more heavily on randomness, and generally
|
||||||
|
try harder not to keep input nodes together when we have the option to
|
||||||
|
scramble things.
|
||||||
|
|
||||||
|
=item C<Performance Regression>
|
||||||
|
|
||||||
|
It would be nice if we had a regression of large designs, with some
|
||||||
|
diversity of design styles, to test on both single- and multi-threaded
|
||||||
|
modes. This would help to avoid performance regressions, and also to
|
||||||
|
evaluate the optimizations while minimizing the impact of parasitic noise.
|
||||||
|
|
||||||
|
=item C<Per-Instance Classes>
|
||||||
|
|
||||||
|
If we have multiple instances of the same module, and they partition
|
||||||
|
differently (likely; we make no attempt to partition them the same) then
|
||||||
|
the variable sort will be suboptimal for either instance. A possible
|
||||||
|
improvement would be to emit a unique class for each instance of a module,
|
||||||
|
and sort its variables optimally for that instance's code stream.
|
||||||
|
|
||||||
|
=back
|
||||||
|
|
||||||
=head2 Verilated Flow
|
=head2 Verilated Flow
|
||||||
|
|
||||||
The evaluation loop outputted by Verilator is designed to allow a single
|
The evaluation loop outputted by Verilator is designed to allow a single
|
||||||
|
|
|
||||||
|
|
@ -64,6 +64,7 @@ sub test {
|
||||||
run("test -e $prefix/bin/verilator");
|
run("test -e $prefix/bin/verilator");
|
||||||
run("test -e $prefix/bin/verilator_bin");
|
run("test -e $prefix/bin/verilator_bin");
|
||||||
run("test -e $prefix/bin/verilator_bin_dbg");
|
run("test -e $prefix/bin/verilator_bin_dbg");
|
||||||
|
run("test -e $prefix/bin/verilator_gantt");
|
||||||
run("test -e $prefix/bin/verilator_profcfunc");
|
run("test -e $prefix/bin/verilator_profcfunc");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -217,6 +217,7 @@ RAW_OBJS = \
|
||||||
V3Order.o \
|
V3Order.o \
|
||||||
V3Os.o \
|
V3Os.o \
|
||||||
V3Param.o \
|
V3Param.o \
|
||||||
|
V3Partition.o \
|
||||||
V3PreShell.o \
|
V3PreShell.o \
|
||||||
V3Premit.o \
|
V3Premit.o \
|
||||||
V3Reloop.o \
|
V3Reloop.o \
|
||||||
|
|
|
||||||
|
|
@ -29,16 +29,24 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include VL_INCLUDE_UNORDERED_SET
|
||||||
|
|
||||||
#include "V3Ast__gen_classes.h" // From ./astgen
|
#include "V3Ast__gen_classes.h" // From ./astgen
|
||||||
// Things like:
|
// Things like:
|
||||||
// class V3AstNode;
|
// class V3AstNode;
|
||||||
|
|
||||||
|
// Forward declarations
|
||||||
|
class V3Graph;
|
||||||
|
class ExecMTask;
|
||||||
|
|
||||||
// Hint class so we can choose constructors
|
// Hint class so we can choose constructors
|
||||||
class VFlagLogicPacked {};
|
class VFlagLogicPacked {};
|
||||||
class VFlagBitPacked {};
|
class VFlagBitPacked {};
|
||||||
class VFlagChildDType {}; // Used by parser.y to select constructor that sets childDType
|
class VFlagChildDType {}; // Used by parser.y to select constructor that sets childDType
|
||||||
|
|
||||||
|
// Used as key for another map, needs operator<, hence not an unordered_set
|
||||||
|
typedef std::set<int> MTaskIdSet; // Set of mtaskIds for Var sorting
|
||||||
|
|
||||||
//######################################################################
|
//######################################################################
|
||||||
|
|
||||||
// For broken() function, return error string if have a match
|
// For broken() function, return error string if have a match
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,8 @@
|
||||||
#include "V3Ast.h"
|
#include "V3Ast.h"
|
||||||
#include "V3File.h"
|
#include "V3File.h"
|
||||||
#include "V3Global.h"
|
#include "V3Global.h"
|
||||||
|
#include "V3Graph.h"
|
||||||
|
#include "V3PartitionGraph.h" // Just for mtask dumping
|
||||||
|
|
||||||
//======================================================================
|
//======================================================================
|
||||||
// Special methods
|
// Special methods
|
||||||
|
|
@ -151,22 +153,26 @@ AstNodeBiop* AstEqWild::newTyped(FileLine* fl, AstNode* lhsp, AstNode* rhsp) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Construct an exec-graph node that owns a freshly allocated, empty
// dependency graph.
AstExecGraph::AstExecGraph(FileLine* fileline)
    : AstNode(fileline)
    , m_depGraphp(new V3Graph) {}
|
||||||
|
// Release the dependency graph allocated by the constructor.
AstExecGraph::~AstExecGraph() {
    delete m_depGraphp;
    VL_DANGLING(m_depGraphp);
}
|
||||||
|
|
||||||
bool AstVar::isSigPublic() const {
|
bool AstVar::isSigPublic() const {
|
||||||
return (m_sigPublic || (v3Global.opt.allPublic() && !isTemp() && !isGenVar()));
|
return (m_sigPublic || (v3Global.opt.allPublic() && !isTemp() && !isGenVar()));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AstVar::isScQuad() const {
|
bool AstVar::isScQuad() const {
|
||||||
return (isSc() && isQuad() && !isScBv() && !isScBigUint());
|
return (isSc() && isQuad() && !isScBv() && !isScBigUint());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AstVar::isScBv() const {
|
bool AstVar::isScBv() const {
|
||||||
return ((isSc() && width() >= v3Global.opt.pinsBv()) || m_attrScBv);
|
return ((isSc() && width() >= v3Global.opt.pinsBv()) || m_attrScBv);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AstVar::isScUint() const {
|
bool AstVar::isScUint() const {
|
||||||
return ((isSc() && v3Global.opt.pinsScUint() && width() >= 2 && width() <= 64) && !isScBv());
|
return ((isSc() && v3Global.opt.pinsScUint() && width() >= 2 && width() <= 64) && !isScBv());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AstVar::isScBigUint() const {
|
bool AstVar::isScBigUint() const {
|
||||||
return ((isSc() && v3Global.opt.pinsScBigUint() && width() >= 65 && width() <= 512) && !isScBv());
|
return ((isSc() && v3Global.opt.pinsScBigUint() && width() >= 65 && width() <= 512) && !isScBv());
|
||||||
}
|
}
|
||||||
|
|
@ -441,6 +447,16 @@ AstVar* AstVar::scVarRecurse(AstNode* nodep) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return the mtask ids recorded for this variable as text: the prefix
// " all: " followed by each id in set order, each followed by one space.
string AstVar::mtasksString() const {
    std::ostringstream out;
    out<<" all: ";
    MTaskIdSet::const_iterator idIt = m_mtaskIds.begin();
    const MTaskIdSet::const_iterator idEnd = m_mtaskIds.end();
    while (idIt != idEnd) {
        out<<*idIt<<" ";
        ++idIt;
    }
    return out.str();
}
|
||||||
|
|
||||||
AstNodeDType* AstNodeDType::dtypeDimensionp(int dimension) {
|
AstNodeDType* AstNodeDType::dtypeDimensionp(int dimension) {
|
||||||
// dimension passed from AstArraySel::dimension
|
// dimension passed from AstArraySel::dimension
|
||||||
// Dimension 0 means the VAR itself, 1 is the closest SEL to the AstVar,
|
// Dimension 0 means the VAR itself, 1 is the closest SEL to the AstVar,
|
||||||
|
|
@ -970,6 +986,11 @@ void AstSliceSel::dump(std::ostream& str) {
|
||||||
str<<" decl"<<declRange();
|
str<<" decl"<<declRange();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Dump this node: the generic AstNode dump, a space, then the attached
// ExecMTask's own dump.
void AstMTaskBody::dump(std::ostream& str) {
    this->AstNode::dump(str);
    str<<" ";
    // m_execMTaskp is NULL until execMTaskp(...) is called, and broken()
    // treats NULL as a reportable error.  Dumping must therefore not
    // dereference it blindly, or printing the offending node would crash
    // instead of producing the diagnostic.
    if (m_execMTaskp) {
        m_execMTaskp->dump(str);
    } else {
        str<<"(no ExecMTask)";
    }
}
|
||||||
void AstTypeTable::dump(std::ostream& str) {
|
void AstTypeTable::dump(std::ostream& str) {
|
||||||
this->AstNode::dump(str);
|
this->AstNode::dump(str);
|
||||||
for (int i=0; i<(int)(AstBasicDTypeKwd::_ENUM_MAX); ++i) {
|
for (int i=0; i<(int)(AstBasicDTypeKwd::_ENUM_MAX); ++i) {
|
||||||
|
|
|
||||||
|
|
@ -1124,6 +1124,7 @@ private:
|
||||||
bool m_noSubst:1; // Do not substitute out references
|
bool m_noSubst:1; // Do not substitute out references
|
||||||
bool m_trace:1; // Trace this variable
|
bool m_trace:1; // Trace this variable
|
||||||
AstVarAttrClocker m_attrClocker;
|
AstVarAttrClocker m_attrClocker;
|
||||||
|
MTaskIdSet m_mtaskIds; // MTaskID's that read or write this var
|
||||||
|
|
||||||
void init() {
|
void init() {
|
||||||
m_input=false; m_output=false; m_tristate=false; m_declOutput=false;
|
m_input=false; m_output=false; m_tristate=false; m_declOutput=false;
|
||||||
|
|
@ -1323,6 +1324,10 @@ public:
|
||||||
if (varType()==AstVarType::INPUT || varType()==AstVarType::OUTPUT) m_varType = AstVarType::WIRE;
|
if (varType()==AstVarType::INPUT || varType()==AstVarType::OUTPUT) m_varType = AstVarType::WIRE;
|
||||||
}
|
}
|
||||||
static AstVar* scVarRecurse(AstNode* nodep);
|
static AstVar* scVarRecurse(AstNode* nodep);
|
||||||
|
void addProducingMTaskId(int id) { m_mtaskIds.insert(id); }
|
||||||
|
void addConsumingMTaskId(int id) { m_mtaskIds.insert(id); }
|
||||||
|
const MTaskIdSet& mtaskIds() const { return m_mtaskIds; }
|
||||||
|
string mtasksString() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
class AstDefParam : public AstNode {
|
class AstDefParam : public AstNode {
|
||||||
|
|
@ -5698,6 +5703,44 @@ public:
|
||||||
AstNode* bodysp() const { return op1p(); } // op1= expressions to print
|
AstNode* bodysp() const { return op1p(); } // op1= expressions to print
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class AstMTaskBody : public AstNode {
|
||||||
|
// Hold statements for each MTask
|
||||||
|
private:
|
||||||
|
ExecMTask* m_execMTaskp;
|
||||||
|
public:
|
||||||
|
explicit AstMTaskBody(FileLine* flp)
|
||||||
|
: AstNode(flp)
|
||||||
|
, m_execMTaskp(NULL) {}
|
||||||
|
ASTNODE_NODE_FUNCS(MTaskBody);
|
||||||
|
virtual const char* broken() const { BROKEN_RTN(!m_execMTaskp); return NULL; }
|
||||||
|
AstNode* stmtsp() const { return op1p(); }
|
||||||
|
void addStmtsp(AstNode* nodep) { addOp1p(nodep); }
|
||||||
|
ExecMTask* execMTaskp() const { return m_execMTaskp; }
|
||||||
|
void execMTaskp(ExecMTask* execMTaskp) { m_execMTaskp = execMTaskp; }
|
||||||
|
virtual void dump(std::ostream& str=std::cout);
|
||||||
|
};
|
||||||
|
|
||||||
|
class AstExecGraph : public AstNode {
|
||||||
|
// For parallel execution, this node contains a dependency graph. Each
|
||||||
|
// node in the graph is an ExecMTask, which contains a body for the
|
||||||
|
// mtask, which contains a set of AstActive's, each of which calls a
|
||||||
|
// leaf AstCFunc. whew!
|
||||||
|
//
|
||||||
|
// The mtask bodies are also children of this node, so we can visit
|
||||||
|
// them without traversing the graph (it's not always needed to
|
||||||
|
// traverse the graph.)
|
||||||
|
private:
|
||||||
|
V3Graph *m_depGraphp; // contains ExecMTask's
|
||||||
|
public:
|
||||||
|
explicit AstExecGraph(FileLine* fileline);
|
||||||
|
ASTNODE_NODE_FUNCS_NO_DTOR(ExecGraph)
|
||||||
|
virtual ~AstExecGraph();
|
||||||
|
virtual const char* broken() const { BROKEN_RTN(!m_depGraphp); return NULL; }
|
||||||
|
const V3Graph* depGraphp() const { return m_depGraphp; }
|
||||||
|
V3Graph* mutableDepGraphp() { return m_depGraphp; }
|
||||||
|
void addMTaskBody(AstMTaskBody* bodyp) { addOp1p(bodyp); }
|
||||||
|
};
|
||||||
|
|
||||||
class AstSplitPlaceholder : public AstNode {
|
class AstSplitPlaceholder : public AstNode {
|
||||||
public:
|
public:
|
||||||
// Dummy node used within V3Split; never exists outside of V3Split.
|
// Dummy node used within V3Split; never exists outside of V3Split.
|
||||||
|
|
@ -5749,12 +5792,14 @@ private:
|
||||||
AstTypeTable* m_typeTablep; // Reference to top type table, for faster lookup
|
AstTypeTable* m_typeTablep; // Reference to top type table, for faster lookup
|
||||||
AstPackage* m_dollarUnitPkgp;
|
AstPackage* m_dollarUnitPkgp;
|
||||||
AstCFunc* m_evalp; // The '_eval' function
|
AstCFunc* m_evalp; // The '_eval' function
|
||||||
|
AstExecGraph* m_execGraphp; // Execution MTask graph for threads>1 mode
|
||||||
public:
|
public:
|
||||||
AstNetlist()
|
AstNetlist()
|
||||||
: AstNode(new FileLine("AstRoot",0))
|
: AstNode(new FileLine("AstRoot",0))
|
||||||
, m_typeTablep(NULL)
|
, m_typeTablep(NULL)
|
||||||
, m_dollarUnitPkgp(NULL)
|
, m_dollarUnitPkgp(NULL)
|
||||||
, m_evalp(NULL) { }
|
, m_evalp(NULL)
|
||||||
|
, m_execGraphp(NULL) { }
|
||||||
ASTNODE_NODE_FUNCS(Netlist)
|
ASTNODE_NODE_FUNCS(Netlist)
|
||||||
virtual const char* broken() const {
|
virtual const char* broken() const {
|
||||||
BROKEN_RTN(m_dollarUnitPkgp && !m_dollarUnitPkgp->brokeExists());
|
BROKEN_RTN(m_dollarUnitPkgp && !m_dollarUnitPkgp->brokeExists());
|
||||||
|
|
@ -5784,6 +5829,8 @@ public:
|
||||||
return m_dollarUnitPkgp; }
|
return m_dollarUnitPkgp; }
|
||||||
AstCFunc* evalp() const { return m_evalp; }
|
AstCFunc* evalp() const { return m_evalp; }
|
||||||
void evalp(AstCFunc* evalp) { m_evalp = evalp; }
|
void evalp(AstCFunc* evalp) { m_evalp = evalp; }
|
||||||
|
AstExecGraph* execGraphp() const { return m_execGraphp; }
|
||||||
|
void execGraphp(AstExecGraph* graphp) { m_execGraphp = graphp; }
|
||||||
};
|
};
|
||||||
|
|
||||||
//######################################################################
|
//######################################################################
|
||||||
|
|
|
||||||
|
|
@ -68,6 +68,7 @@ private:
|
||||||
AstCFunc* m_settleFuncp; // Top settlement function we are creating
|
AstCFunc* m_settleFuncp; // Top settlement function we are creating
|
||||||
AstSenTree* m_lastSenp; // Last sensitivity match, so we can detect duplicates.
|
AstSenTree* m_lastSenp; // Last sensitivity match, so we can detect duplicates.
|
||||||
AstIf* m_lastIfp; // Last sensitivity if active to add more under
|
AstIf* m_lastIfp; // Last sensitivity if active to add more under
|
||||||
|
AstMTaskBody* m_mtaskBodyp; // Current mtask body
|
||||||
|
|
||||||
// METHODS
|
// METHODS
|
||||||
VL_DEBUG_FUNC; // Declare debug()
|
VL_DEBUG_FUNC; // Declare debug()
|
||||||
|
|
@ -338,6 +339,30 @@ private:
|
||||||
// Only empty blocks should be leftover on the non-top. Killem.
|
// Only empty blocks should be leftover on the non-top. Killem.
|
||||||
if (nodep->stmtsp()) nodep->v3fatalSrc("Non-empty lower active");
|
if (nodep->stmtsp()) nodep->v3fatalSrc("Non-empty lower active");
|
||||||
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
|
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
|
||||||
|
} else if (m_mtaskBodyp) {
|
||||||
|
UINFO(4," TR ACTIVE "<<nodep<<endl);
|
||||||
|
AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
|
||||||
|
if (nodep->hasClocked()) {
|
||||||
|
if (nodep->hasInitial()) nodep->v3fatalSrc("Initial block should not have clock sensitivity");
|
||||||
|
if (m_lastSenp && nodep->sensesp()->sameTree(m_lastSenp)) {
|
||||||
|
UINFO(4," sameSenseTree\n");
|
||||||
|
} else {
|
||||||
|
clearLastSen();
|
||||||
|
m_lastSenp = nodep->sensesp();
|
||||||
|
// Make a new if statement
|
||||||
|
m_lastIfp = makeActiveIf(m_lastSenp);
|
||||||
|
m_mtaskBodyp->addStmtsp(m_lastIfp);
|
||||||
|
}
|
||||||
|
// Move statements to if
|
||||||
|
m_lastIfp->addIfsp(stmtsp);
|
||||||
|
} else if (nodep->hasInitial() || nodep->hasSettle()) {
|
||||||
|
nodep->v3fatalSrc("MTask should not include initial/settle logic.");
|
||||||
|
} else {
|
||||||
|
// Combo logic. Move statements to mtask func.
|
||||||
|
clearLastSen();
|
||||||
|
m_mtaskBodyp->addStmtsp(stmtsp);
|
||||||
|
}
|
||||||
|
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
|
||||||
} else {
|
} else {
|
||||||
UINFO(4," ACTIVE "<<nodep<<endl);
|
UINFO(4," ACTIVE "<<nodep<<endl);
|
||||||
AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
|
AstNode* stmtsp = nodep->stmtsp()->unlinkFrBackWithNext();
|
||||||
|
|
@ -372,6 +397,20 @@ private:
|
||||||
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
|
nodep->unlinkFrBack()->deleteTree(); VL_DANGLING(nodep);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
virtual void visit(AstExecGraph* nodep) {
    // Walk each AstMTaskBody child (op1 list) in order.  m_mtaskBodyp
    // doubles as the loop cursor and as the "current mtask body" seen by
    // the other visitors; it is NULL again once the list is exhausted.
    m_mtaskBodyp = VN_CAST(nodep->op1p(), MTaskBody);
    while (m_mtaskBodyp) {
        // Start every body with fresh last-sensitivity state
        clearLastSen();
        iterate(m_mtaskBodyp);
        m_mtaskBodyp = VN_CAST(m_mtaskBodyp->nextp(), MTaskBody);
    }
    clearLastSen();

    // Move the ExecGraph into _eval.  Its location marks the
    // spot where the graph will execute, relative to other
    // (serial) logic in the cycle.
    nodep->unlinkFrBack();
    addToEvalLoop(nodep);
}
|
||||||
|
|
||||||
//--------------------
|
//--------------------
|
||||||
// Default: Just iterate
|
// Default: Just iterate
|
||||||
|
|
@ -391,6 +430,7 @@ public:
|
||||||
m_lastSenp = NULL;
|
m_lastSenp = NULL;
|
||||||
m_lastIfp = NULL;
|
m_lastIfp = NULL;
|
||||||
m_scopep = NULL;
|
m_scopep = NULL;
|
||||||
|
m_mtaskBodyp = NULL;
|
||||||
//
|
//
|
||||||
iterate(nodep);
|
iterate(nodep);
|
||||||
// Allow downstream modules to find _eval()
|
// Allow downstream modules to find _eval()
|
||||||
|
|
|
||||||
432
src/V3EmitC.cpp
432
src/V3EmitC.cpp
|
|
@ -34,6 +34,8 @@
|
||||||
#include "V3EmitC.h"
|
#include "V3EmitC.h"
|
||||||
#include "V3EmitCBase.h"
|
#include "V3EmitCBase.h"
|
||||||
#include "V3Number.h"
|
#include "V3Number.h"
|
||||||
|
#include "V3PartitionGraph.h"
|
||||||
|
#include "V3TSP.h"
|
||||||
|
|
||||||
#define VL_VALUE_STRING_MAX_WIDTH 8192 // We use a static char array in VL_VALUE_STRING
|
#define VL_VALUE_STRING_MAX_WIDTH 8192 // We use a static char array in VL_VALUE_STRING
|
||||||
|
|
||||||
|
|
@ -103,7 +105,13 @@ public:
|
||||||
puts("["+cvtToStr(arrayp->elementsConst())+"]");
|
puts("["+cvtToStr(arrayp->elementsConst())+"]");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
void emitVarCmtChg(const AstVar* varp, string* curVarCmtp) {
|
||||||
|
string newVarCmt = varp->mtasksString();
|
||||||
|
if (*curVarCmtp != newVarCmt) {
|
||||||
|
*curVarCmtp = newVarCmt;
|
||||||
|
puts("// Begin mtask footprint "+*curVarCmtp+"\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
void emitTypedefs(AstNode* firstp) {
|
void emitTypedefs(AstNode* firstp) {
|
||||||
bool first = true;
|
bool first = true;
|
||||||
for (AstNode* loopp=firstp; loopp; loopp = loopp->nextp()) {
|
for (AstNode* loopp=firstp; loopp; loopp = loopp->nextp()) {
|
||||||
|
|
@ -783,6 +791,50 @@ public:
|
||||||
virtual ~EmitCStmts() {}
|
virtual ~EmitCStmts() {}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
//######################################################################
|
||||||
|
// Establish mtask variable sort order in mtasks mode
|
||||||
|
|
||||||
|
class EmitVarTspSorter : public V3TSP::TspStateBase {
|
||||||
|
private:
|
||||||
|
// MEMBERS
|
||||||
|
const MTaskIdSet& m_mtaskIds; // Mtask we're ordering
|
||||||
|
static unsigned m_serialNext; // Unique ID to establish serial order
|
||||||
|
unsigned m_serial; // Serial ordering
|
||||||
|
public:
|
||||||
|
// CONSTRUCTORS
|
||||||
|
explicit EmitVarTspSorter(const MTaskIdSet& mtaskIds)
|
||||||
|
: m_mtaskIds(mtaskIds),
|
||||||
|
m_serial(++m_serialNext) {}
|
||||||
|
virtual ~EmitVarTspSorter() {}
|
||||||
|
// METHODS
|
||||||
|
bool operator<(const TspStateBase& other) const {
|
||||||
|
return operator<(dynamic_cast<const EmitVarTspSorter&>(other));
|
||||||
|
}
|
||||||
|
bool operator<(const EmitVarTspSorter& other) const {
|
||||||
|
return m_serial < other.m_serial;
|
||||||
|
}
|
||||||
|
const MTaskIdSet& mtaskIds() const { return m_mtaskIds; }
|
||||||
|
virtual int cost(const TspStateBase* otherp) const {
|
||||||
|
return cost(dynamic_cast<const EmitVarTspSorter*>(otherp));
|
||||||
|
}
|
||||||
|
virtual int cost(const EmitVarTspSorter* otherp) const {
|
||||||
|
int cost = diffs(m_mtaskIds, otherp->m_mtaskIds);
|
||||||
|
cost += diffs(otherp->m_mtaskIds, m_mtaskIds);
|
||||||
|
return cost;
|
||||||
|
}
|
||||||
|
// Returns the number of elements in set_a that don't appear in set_b
|
||||||
|
static int diffs(const MTaskIdSet& set_a, const MTaskIdSet& set_b) {
|
||||||
|
int diffs = 0;
|
||||||
|
for (MTaskIdSet::iterator it = set_a.begin();
|
||||||
|
it != set_a.end(); ++it) {
|
||||||
|
if (set_b.find(*it) == set_b.end()) ++diffs;
|
||||||
|
}
|
||||||
|
return diffs;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
unsigned EmitVarTspSorter::m_serialNext = 0;
|
||||||
|
|
||||||
//######################################################################
|
//######################################################################
|
||||||
// Internal EmitC implementation
|
// Internal EmitC implementation
|
||||||
|
|
||||||
|
|
@ -873,6 +925,91 @@ class EmitCImp : EmitCStmts {
|
||||||
return ofp;
|
return ofp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the number of cross-thread dependencies into mtaskp.
|
||||||
|
// If >0, mtaskp must test whether its prereqs are done before starting,
|
||||||
|
// and may need to block.
|
||||||
|
static uint32_t packedMTaskMayBlock(const ExecMTask* mtaskp) {
|
||||||
|
uint32_t result = 0;
|
||||||
|
for (V3GraphEdge* edgep = mtaskp->inBeginp(); edgep; edgep = edgep->inNextp()) {
|
||||||
|
const ExecMTask* prevp = dynamic_cast<ExecMTask*>(edgep->fromp());
|
||||||
|
if (prevp->thread() != mtaskp->thread()) {
|
||||||
|
++result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Emit the generated C++ for one mtask body, followed inline by the body of
// its packed successor (packNextp()), recursively, until the chain ends.
// Per mtask, in order: optional wait on upstream mtasks, optional profiling
// start record, mtask ID bookkeeping, the body statements, optional
// profiling end record, message queue flush, and notification of
// downstream mtasks that sit on other threads.
void emitMTaskBody(AstMTaskBody* nodep) {
    ExecMTask* const mtaskp = nodep->execMTaskp();
    const string idStr = cvtToStr(mtaskp->id());

    // Only mtasks with cross-thread prerequisites need to wait
    if (packedMTaskMayBlock(mtaskp)) {
        puts("vlTOPp->__Vm_mt_" + idStr
             + ".waitUntilUpstreamDone(even_cycle);\n");
    }

    string profRecName;  // Name of the generated VlProfileRec* local, if profiling
    if (v3Global.opt.profThreads()) {
        profRecName = "__Vprfthr_" + idStr;
        puts("VlProfileRec* " + profRecName + " = NULL;\n");
        // Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
        puts("if (VL_UNLIKELY(vlTOPp->__Vm_profile_cycle_start)) {\n");
        puts(profRecName + " = vlTOPp->__Vm_threadPoolp->profileAppend();\n");
        puts(profRecName + "->startRecord(VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start,");
        puts(" " + idStr + ",");
        puts(" " + cvtToStr(mtaskp->cost()) + ");\n");
        puts("}\n");
    }
    puts("Verilated::mtaskId(" + idStr + ");\n");

    // The actual body of calls to leaf functions
    iterateAndNextNull(nodep->stmtsp());

    if (v3Global.opt.profThreads()) {
        // Leave this if() here, as don't want to call VL_RDTSC_Q unless profiling
        puts("if (VL_UNLIKELY(" + profRecName + ")) {\n");
        puts(profRecName + "->endRecord(VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start);\n");
        puts("}\n");
    }

    // Flush message queue
    puts("Verilated::endOfThreadMTask(vlSymsp->__Vm_evalMsgQp);\n");

    // For any downstream mtask that's on another thread, bump its
    // counter and maybe notify it.
    V3GraphEdge* outEdgep = mtaskp->outBeginp();
    while (outEdgep) {
        const ExecMTask* downstreamp = dynamic_cast<ExecMTask*>(outEdgep->top());
        if (downstreamp->thread() != mtaskp->thread()) {
            puts("vlTOPp->__Vm_mt_" + cvtToStr(downstreamp->id())
                 + ".signalUpstreamDone(even_cycle);\n");
        }
        outEdgep = outEdgep->outNextp();
    }

    const ExecMTask* const packedNextp = mtaskp->packNextp();
    if (!packedNextp) {
        // End of the packed chain: unblock the fake "final" mtask
        puts("vlTOPp->__Vm_mt_final.signalUpstreamDone(even_cycle);\n");
        return;
    }
    // Run the next mtask inline
    emitMTaskBody(packedNextp->bodyp());
}
|
||||||
|
|
||||||
|
// Emit the C++ function definition for an mtask body: the signature
// "void <class>::<cFuncName>(bool even_cycle, void* symtab)", the symbol
// table setup, and then the body via emitMTaskBody().
virtual void visit(AstMTaskBody* nodep) {
    ExecMTask* const execp = nodep->execMTaskp();
    const string qualifiedName = modClassName(m_modp)+"::"+execp->cFuncName();
    puts("\n");
    puts("void ");
    puts(qualifiedName);
    puts("(bool even_cycle, void* symtab) {\n");

    // Declare and set vlSymsp
    const string symsDecl = EmitCBaseVisitor::symClassVar() + " = ("
        + EmitCBaseVisitor::symClassName() + "*)symtab;\n";
    puts(symsDecl);
    puts(EmitCBaseVisitor::symTopAssign()+"\n");

    emitMTaskBody(nodep);
    puts("}\n");
}
|
||||||
|
|
||||||
//---------------------------------------
|
//---------------------------------------
|
||||||
// VISITORS
|
// VISITORS
|
||||||
using EmitCStmts::visit; // Suppress hidden overloaded virtual function warning
|
using EmitCStmts::visit; // Suppress hidden overloaded virtual function warning
|
||||||
|
|
@ -973,6 +1110,54 @@ class EmitCImp : EmitCStmts {
|
||||||
emitVarReset(varp);
|
emitVarReset(varp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emit, at the point inside _eval() where the AstExecGraph sits, the code
// that launches the mtask graph and waits for it to complete.
virtual void visit(AstExecGraph* nodep) {
    if (nodep != v3Global.rootp()->execGraphp()) {
        nodep->v3fatalSrc("ExecGraph should be a singleton!");
    }
    // Don't recurse to children -- this isn't the place to emit
    // function definitions for the nested CFuncs.  We'll do that at the
    // end.
    puts("vlTOPp->__Vm_even_cycle = !vlTOPp->__Vm_even_cycle;\n");

    // Collect the thread-root mtasks; each one starts a thread's chain
    std::vector<const ExecMTask*> rootMTasks;
    const V3GraphVertex* vxp = nodep->depGraphp()->verticesBeginp();
    while (vxp) {
        const ExecMTask* etp = dynamic_cast<const ExecMTask*>(vxp);
        if (etp->threadRoot()) rootMTasks.push_back(etp);
        vxp = vxp->verticesNextp();
    }
    if (rootMTasks.size()
        > static_cast<unsigned>(v3Global.opt.threads())) {
        nodep->v3fatalSrc("More root mtasks than available threads");
    }

    // Nothing to launch, so nothing to wait for either
    if (rootMTasks.empty()) return;

    // All roots but the last go to the thread pool, one per worker.
    const uint32_t lastIdx = rootMTasks.size() - 1;
    for (uint32_t i = 0; i < lastIdx; ++i) {
        puts("vlTOPp->__Vm_threadPoolp->workerp("
             + cvtToStr(i)+")->addTask("
             + rootMTasks[i]->cFuncName()
             + ", vlTOPp->__Vm_even_cycle, vlSymsp);\n");
    }
    // The thread calling eval() will run the last root mtask inline,
    // along with its packed successors.
    puts(rootMTasks[lastIdx]->cFuncName()
         + "(vlTOPp->__Vm_even_cycle, vlSymsp);\n");
    puts("Verilated::mtaskId(0);\n");

    // Wait on the "final" mtask vertex
    puts("vlTOPp->__Vm_mt_final.waitUntilUpstreamDone(vlTOPp->__Vm_even_cycle);\n");
}
|
||||||
|
|
||||||
//---------------------------------------
|
//---------------------------------------
|
||||||
// ACCESSORS
|
// ACCESSORS
|
||||||
|
|
||||||
|
|
@ -995,6 +1180,8 @@ class EmitCImp : EmitCStmts {
|
||||||
void emitStaticDecl(AstNodeModule* modp);
|
void emitStaticDecl(AstNodeModule* modp);
|
||||||
void emitSettleLoop(const std::string& eval_call, bool initial);
|
void emitSettleLoop(const std::string& eval_call, bool initial);
|
||||||
void emitWrapEval(AstNodeModule* modp);
|
void emitWrapEval(AstNodeModule* modp);
|
||||||
|
void emitMTaskState();
|
||||||
|
void emitMTaskVertexCtors(bool* firstp);
|
||||||
void emitInt(AstNodeModule* modp);
|
void emitInt(AstNodeModule* modp);
|
||||||
void maybeSplit(AstNodeModule* modp);
|
void maybeSplit(AstNodeModule* modp);
|
||||||
|
|
||||||
|
|
@ -1534,6 +1721,36 @@ void EmitCImp::emitCoverageDecl(AstNodeModule* modp) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void EmitCImp::emitMTaskVertexCtors(bool* firstp) {
|
||||||
|
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
|
||||||
|
if (!execGraphp) v3Global.rootp()->v3fatalSrc("Should have an execGraphp");
|
||||||
|
const V3Graph* depGraphp = execGraphp->depGraphp();
|
||||||
|
|
||||||
|
unsigned finalEdgesInCt = 0;
|
||||||
|
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
|
||||||
|
vxp; vxp = vxp->verticesNextp()) {
|
||||||
|
const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
|
||||||
|
unsigned edgesInCt = packedMTaskMayBlock(mtp);
|
||||||
|
if (packedMTaskMayBlock(mtp) > 0) {
|
||||||
|
emitCtorSep(firstp);
|
||||||
|
puts("__Vm_mt_"+cvtToStr(mtp->id())+"("+cvtToStr(edgesInCt)+")");
|
||||||
|
}
|
||||||
|
// Each mtask with no packed successor will become a dependency
|
||||||
|
// for the final node:
|
||||||
|
if (!mtp->packNextp()) ++finalEdgesInCt;
|
||||||
|
}
|
||||||
|
|
||||||
|
emitCtorSep(firstp);
|
||||||
|
puts("__Vm_mt_final(" + cvtToStr(finalEdgesInCt) + ")");
|
||||||
|
|
||||||
|
// This will flip to 'true' before the start of the 0th cycle.
|
||||||
|
emitCtorSep(firstp); puts("__Vm_threadPoolp(NULL)");
|
||||||
|
if (v3Global.opt.profThreads()) {
|
||||||
|
emitCtorSep(firstp); puts("__Vm_profile_cycle_start(0)");
|
||||||
|
}
|
||||||
|
emitCtorSep(firstp); puts("__Vm_even_cycle(false)");
|
||||||
|
}
|
||||||
|
|
||||||
void EmitCImp::emitCtorImp(AstNodeModule* modp) {
|
void EmitCImp::emitCtorImp(AstNodeModule* modp) {
|
||||||
puts("\n");
|
puts("\n");
|
||||||
bool first = true;
|
bool first = true;
|
||||||
|
|
@ -1544,6 +1761,9 @@ void EmitCImp::emitCtorImp(AstNodeModule* modp) {
|
||||||
first = false; // VL_CTOR_IMP includes the first ':'
|
first = false; // VL_CTOR_IMP includes the first ':'
|
||||||
}
|
}
|
||||||
emitVarCtors(&first);
|
emitVarCtors(&first);
|
||||||
|
if (modp->isTop() && v3Global.opt.mtasks()) {
|
||||||
|
emitMTaskVertexCtors(&first);
|
||||||
|
}
|
||||||
puts(" {\n");
|
puts(" {\n");
|
||||||
emitCellCtors(modp);
|
emitCellCtors(modp);
|
||||||
emitSensitives();
|
emitSensitives();
|
||||||
|
|
@ -1556,6 +1776,39 @@ void EmitCImp::emitCtorImp(AstNodeModule* modp) {
|
||||||
putsDecoration("// Reset structure values\n");
|
putsDecoration("// Reset structure values\n");
|
||||||
puts("_ctor_var_reset();\n");
|
puts("_ctor_var_reset();\n");
|
||||||
emitTextSection(AstType::atScCtor);
|
emitTextSection(AstType::atScCtor);
|
||||||
|
|
||||||
|
if (modp->isTop() && v3Global.opt.mtasks()) {
|
||||||
|
// TODO-- For now each top module creates its own ThreadPool here,
|
||||||
|
// and deletes it in the destructor. If A and B are each top level
|
||||||
|
// modules, each creates a separate thread pool. This allows
|
||||||
|
// A.eval() and B.eval() to run concurrently without any
|
||||||
|
// interference -- so long as the physical machine has enough cores
|
||||||
|
// to support both pools and all testbench threads.
|
||||||
|
//
|
||||||
|
// In the future, we might want to let the client provide a
|
||||||
|
// threadpool to the constructor. This would allow two or more
|
||||||
|
// models to share a single threadpool.
|
||||||
|
//
|
||||||
|
// For example: suppose models A and B are each compiled to run on
|
||||||
|
// 4 threads. The client might create a single thread pool with 3
|
||||||
|
// threads and pass it to both models. If the client can ensure tht
|
||||||
|
// A.eval() and B.eval() do NOT run concurrently, there will be no
|
||||||
|
// contention for the threads. This mode is missing for now. (Is
|
||||||
|
// there demand for such a setup?)
|
||||||
|
puts("__Vm_threadPoolp = new VlThreadPool("
|
||||||
|
// Note we create N-1 threads in the thread pool. The thread
|
||||||
|
// that calls eval() becomes the final Nth thread for the
|
||||||
|
// duration of the eval call.
|
||||||
|
+ cvtToStr(v3Global.opt.threads() - 1)
|
||||||
|
+ ", " + cvtToStr(v3Global.opt.profThreads())
|
||||||
|
+ ");\n");
|
||||||
|
|
||||||
|
if (v3Global.opt.profThreads()) {
|
||||||
|
puts("__Vm_profile_cycle_start = 0;\n");
|
||||||
|
puts("__Vm_profile_time_finished = 0;\n");
|
||||||
|
puts("__Vm_profile_window_ct = 0;");
|
||||||
|
}
|
||||||
|
}
|
||||||
puts("}\n");
|
puts("}\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1597,6 +1850,9 @@ void EmitCImp::emitCoverageImp(AstNodeModule* modp) {
|
||||||
void EmitCImp::emitDestructorImp(AstNodeModule* modp) {
|
void EmitCImp::emitDestructorImp(AstNodeModule* modp) {
|
||||||
puts("\n");
|
puts("\n");
|
||||||
puts(modClassName(modp)+"::~"+modClassName(modp)+"() {\n");
|
puts(modClassName(modp)+"::~"+modClassName(modp)+"() {\n");
|
||||||
|
if (modp->isTop() && v3Global.opt.mtasks()) {
|
||||||
|
puts("delete __Vm_threadPoolp; __Vm_threadPoolp = NULL;\n");
|
||||||
|
}
|
||||||
emitTextSection(AstType::atScDtor);
|
emitTextSection(AstType::atScDtor);
|
||||||
if (modp->isTop()) puts("delete __VlSymsp; __VlSymsp=NULL;\n");
|
if (modp->isTop()) puts("delete __VlSymsp; __VlSymsp=NULL;\n");
|
||||||
puts("}\n");
|
puts("}\n");
|
||||||
|
|
@ -1796,9 +2052,47 @@ void EmitCImp::emitWrapEval(AstNodeModule* modp) {
|
||||||
if (v3Global.opt.threads() == 1) {
|
if (v3Global.opt.threads() == 1) {
|
||||||
uint32_t mtaskId = 0;
|
uint32_t mtaskId = 0;
|
||||||
putsDecoration("// MTask "+cvtToStr(mtaskId)+" start\n");
|
putsDecoration("// MTask "+cvtToStr(mtaskId)+" start\n");
|
||||||
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask starting, mtaskId="+cvtToStr(mtaskId)+"\\n\"););\n");
|
puts("VL_DEBUG_IF(VL_DBG_MSGF(\"MTask"+cvtToStr(mtaskId)+" starting\\n\"););\n");
|
||||||
puts("Verilated::mtaskId("+cvtToStr(mtaskId)+");\n");
|
puts("Verilated::mtaskId("+cvtToStr(mtaskId)+");\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (v3Global.opt.mtasks()
|
||||||
|
&& v3Global.opt.profThreads()) {
|
||||||
|
puts("if (VL_UNLIKELY((Verilated::profThreadsStart() != __Vm_profile_time_finished)\n");
|
||||||
|
puts( " && (VL_TIME_Q() > Verilated::profThreadsStart())\n");
|
||||||
|
puts( " && (Verilated::profThreadsWindow() >= 1))) {\n");
|
||||||
|
// Within a profile (either starting, middle, or end)
|
||||||
|
puts( "if (vlTOPp->__Vm_profile_window_ct == 0) {\n"); // Opening file?
|
||||||
|
// Start profile on this cycle. We'll capture a window worth, then
|
||||||
|
// only analyze the next window worth. The idea is that the first window
|
||||||
|
// capture will hit some cache-cold stuff (eg printf) but it'll be warm
|
||||||
|
// by the time we hit the second window, we hope.
|
||||||
|
puts( "vlTOPp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
|
||||||
|
// "* 2" as first half is warmup, second half is collection
|
||||||
|
puts( "vlTOPp->__Vm_profile_window_ct = Verilated::profThreadsWindow() * 2 + 1;\n");
|
||||||
|
puts( "}\n");
|
||||||
|
puts( "--vlTOPp->__Vm_profile_window_ct;\n");
|
||||||
|
puts( "if (vlTOPp->__Vm_profile_window_ct == (Verilated::profThreadsWindow())) {\n");
|
||||||
|
// This barrier record in every thread's profile demarcates the
|
||||||
|
// cache-warm-up cycles before the barrier from the actual profile
|
||||||
|
// cycles afterward.
|
||||||
|
puts( "vlTOPp->__Vm_threadPoolp->profileAppendAll(");
|
||||||
|
puts( "VlProfileRec(VlProfileRec::Barrier()));\n");
|
||||||
|
puts( "vlTOPp->__Vm_profile_cycle_start = VL_RDTSC_Q();\n");
|
||||||
|
puts( "}\n");
|
||||||
|
puts( "else if (vlTOPp->__Vm_profile_window_ct == 0) {\n");
|
||||||
|
// Ending file.
|
||||||
|
puts( "vluint64_t elapsed = VL_RDTSC_Q() - vlTOPp->__Vm_profile_cycle_start;\n");
|
||||||
|
puts( "vlTOPp->__Vm_threadPoolp->profileDump(Verilated::profThreadsFilenamep(), elapsed);\n");
|
||||||
|
// This turns off the test to enter the profiling code, but still
|
||||||
|
// allows the user to collect another profile by changing
|
||||||
|
// profThreadsStart
|
||||||
|
puts( "__Vm_profile_time_finished = Verilated::profThreadsStart();\n");
|
||||||
|
puts( "vlTOPp->__Vm_profile_cycle_start = 0;\n");
|
||||||
|
puts( "}\n");
|
||||||
|
puts("}\n");
|
||||||
|
}
|
||||||
|
|
||||||
emitSettleLoop(
|
emitSettleLoop(
|
||||||
(string("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Clock loop\\n\"););\n")
|
(string("VL_DEBUG_IF(VL_DBG_MSGF(\"+ Clock loop\\n\"););\n")
|
||||||
+ (v3Global.opt.trace() ? "vlSymsp->__Vm_activity = true;\n" : "")
|
+ (v3Global.opt.trace() ? "vlSymsp->__Vm_activity = true;\n" : "")
|
||||||
|
|
@ -1832,10 +2126,13 @@ void EmitCStmts::emitVarList(AstNode* firstp, EisWhich which, const string& pref
|
||||||
// Put out a list of signal declarations
|
// Put out a list of signal declarations
|
||||||
// in order of 0:clocks, 1:vluint8, 2:vluint16, 4:vluint32, 5:vluint64, 6:wide, 7:arrays
|
// in order of 0:clocks, 1:vluint8, 2:vluint16, 4:vluint32, 5:vluint64, 6:wide, 7:arrays
|
||||||
// This aids cache packing and locality
|
// This aids cache packing and locality
|
||||||
// Largest->smallest reduces the number of pad variables.
|
|
||||||
// But for now, Smallest->largest makes it more likely a small offset will allow access to the signal.
|
|
||||||
// TODO: Move this sort to an earlier visitor stage.
|
|
||||||
//
|
//
|
||||||
|
// Largest->smallest reduces the number of pad variables. Also
|
||||||
|
// experimented with alternating between large->small and small->large
|
||||||
|
// on successive Mtask groups, but then when a new mtask gets added may
|
||||||
|
// cause a huge delta.
|
||||||
|
//
|
||||||
|
// TODO: Move this sort to an earlier visitor stage.
|
||||||
VarSortMap varAnonMap;
|
VarSortMap varAnonMap;
|
||||||
VarSortMap varNonanonMap;
|
VarSortMap varNonanonMap;
|
||||||
|
|
||||||
|
|
@ -1891,8 +2188,9 @@ void EmitCStmts::emitVarList(AstNode* firstp, EisWhich which, const string& pref
|
||||||
|
|
||||||
void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
|
void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
|
||||||
UASSERT(sortedp->empty(), "Sorted should be initially empty");
|
UASSERT(sortedp->empty(), "Sorted should be initially empty");
|
||||||
{
|
if (!v3Global.opt.mtasks()) {
|
||||||
// Plain old serial mode. Sort by size, from small to large.
|
// Plain old serial mode. Sort by size, from small to large,
|
||||||
|
// to optimize for both packing and small offsets in code.
|
||||||
for (VarSortMap::const_iterator it = vmap.begin();
|
for (VarSortMap::const_iterator it = vmap.begin();
|
||||||
it != vmap.end(); ++it) {
|
it != vmap.end(); ++it) {
|
||||||
for (VarVec::const_iterator jt = it->second.begin();
|
for (VarVec::const_iterator jt = it->second.begin();
|
||||||
|
|
@ -1900,12 +2198,52 @@ void EmitCStmts::emitVarSort(const VarSortMap& vmap, VarVec* sortedp) {
|
||||||
sortedp->push_back(*jt);
|
sortedp->push_back(*jt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// MacroTask mode. Sort by MTask-affinity group first, size second.
|
||||||
|
typedef std::map<MTaskIdSet, VarSortMap> MTaskVarSortMap;
|
||||||
|
MTaskVarSortMap m2v;
|
||||||
|
for (VarSortMap::const_iterator it = vmap.begin(); it != vmap.end(); ++it) {
|
||||||
|
int size_class = it->first;
|
||||||
|
const VarVec& vec = it->second;
|
||||||
|
for (VarVec::const_iterator jt = vec.begin(); jt != vec.end(); ++jt) {
|
||||||
|
const AstVar* varp = *jt;
|
||||||
|
m2v[varp->mtaskIds()][size_class].push_back(varp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a TSP sort state for each MTaskIdSet footprint
|
||||||
|
V3TSP::StateVec states;
|
||||||
|
for (MTaskVarSortMap::iterator it = m2v.begin(); it != m2v.end(); ++it) {
|
||||||
|
states.push_back(new EmitVarTspSorter(it->first));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Do the TSP sort
|
||||||
|
V3TSP::StateVec sorted_states;
|
||||||
|
V3TSP::tspSort(states, &sorted_states);
|
||||||
|
|
||||||
|
for (V3TSP::StateVec::iterator it = sorted_states.begin();
|
||||||
|
it != sorted_states.end(); ++it) {
|
||||||
|
const EmitVarTspSorter* statep = dynamic_cast<const EmitVarTspSorter*>(*it);
|
||||||
|
const VarSortMap& localVmap = m2v[statep->mtaskIds()];
|
||||||
|
// use rbegin/rend to sort size large->small
|
||||||
|
for (VarSortMap::const_reverse_iterator jt = localVmap.rbegin();
|
||||||
|
jt != localVmap.rend(); ++jt) {
|
||||||
|
const VarVec& vec = jt->second;
|
||||||
|
for (VarVec::const_iterator kt = vec.begin();
|
||||||
|
kt != vec.end(); ++kt) {
|
||||||
|
sortedp->push_back(*kt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
delete statep; VL_DANGLING(statep);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmitCStmts::emitSortedVarList(const VarVec& anons,
|
void EmitCStmts::emitSortedVarList(const VarVec& anons,
|
||||||
const VarVec& nonanons,
|
const VarVec& nonanons,
|
||||||
const string& prefixIfImp) {
|
const string& prefixIfImp) {
|
||||||
|
string curVarCmt = "";
|
||||||
// Output anons
|
// Output anons
|
||||||
{
|
{
|
||||||
int anonMembers = anons.size();
|
int anonMembers = anons.size();
|
||||||
|
|
@ -1933,6 +2271,7 @@ void EmitCStmts::emitSortedVarList(const VarVec& anons,
|
||||||
if (anonL1s != 1) puts("struct {\n");
|
if (anonL1s != 1) puts("struct {\n");
|
||||||
for (int l0=0; l0<lim && it != anons.end(); ++l0) {
|
for (int l0=0; l0<lim && it != anons.end(); ++l0) {
|
||||||
const AstVar* varp = *it;
|
const AstVar* varp = *it;
|
||||||
|
emitVarCmtChg(varp, &curVarCmt);
|
||||||
emitVarDecl(varp, prefixIfImp);
|
emitVarDecl(varp, prefixIfImp);
|
||||||
++it;
|
++it;
|
||||||
}
|
}
|
||||||
|
|
@ -1945,12 +2284,14 @@ void EmitCStmts::emitSortedVarList(const VarVec& anons,
|
||||||
// Leftovers, just in case off by one error somewhere above
|
// Leftovers, just in case off by one error somewhere above
|
||||||
for (; it != anons.end(); ++it) {
|
for (; it != anons.end(); ++it) {
|
||||||
const AstVar* varp = *it;
|
const AstVar* varp = *it;
|
||||||
|
emitVarCmtChg(varp, &curVarCmt);
|
||||||
emitVarDecl(varp, prefixIfImp);
|
emitVarDecl(varp, prefixIfImp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Output nonanons
|
// Output nonanons
|
||||||
for (VarVec::const_iterator it = nonanons.begin(); it != nonanons.end(); ++it) {
|
for (VarVec::const_iterator it = nonanons.begin(); it != nonanons.end(); ++it) {
|
||||||
const AstVar* varp = *it;
|
const AstVar* varp = *it;
|
||||||
|
emitVarCmtChg(varp, &curVarCmt);
|
||||||
emitVarDecl(varp, prefixIfImp);
|
emitVarDecl(varp, prefixIfImp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -1986,6 +2327,59 @@ void EmitCImp::emitIntFuncDecls(AstNodeModule* modp) {
|
||||||
if (funcp->ifdef()!="") puts("#endif // "+funcp->ifdef()+"\n");
|
if (funcp->ifdef()!="") puts("#endif // "+funcp->ifdef()+"\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (modp->isTop() && v3Global.opt.mtasks()) {
|
||||||
|
// Emit the mtask func prototypes.
|
||||||
|
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
|
||||||
|
if (!execGraphp) v3Global.rootp()->v3fatalSrc("Root should have an execGraphp");
|
||||||
|
const V3Graph* depGraphp = execGraphp->depGraphp();
|
||||||
|
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
|
||||||
|
vxp; vxp = vxp->verticesNextp()) {
|
||||||
|
const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
|
||||||
|
if (mtp->threadRoot()) {
|
||||||
|
// Emit function declaration for this mtask
|
||||||
|
ofp()->putsPrivate(true);
|
||||||
|
puts("static void "); puts(mtp->cFuncName());
|
||||||
|
puts("(bool even_cycle, void* symtab);\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// No AstCFunc for this one, as it's synthetic. Just write it:
|
||||||
|
puts("static void __Vmtask__final(bool even_cycle, void* symtab);\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void EmitCImp::emitMTaskState() {
|
||||||
|
ofp()->putsPrivate(true);
|
||||||
|
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
|
||||||
|
if (!execGraphp) v3Global.rootp()->v3fatalSrc("Root should have an execGraphp");
|
||||||
|
|
||||||
|
const V3Graph* depGraphp = execGraphp->depGraphp();
|
||||||
|
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
|
||||||
|
vxp; vxp = vxp->verticesNextp()) {
|
||||||
|
const ExecMTask* mtp = dynamic_cast<const ExecMTask*>(vxp);
|
||||||
|
if (packedMTaskMayBlock(mtp) > 0) {
|
||||||
|
puts("VlMTaskVertex __Vm_mt_" + cvtToStr(mtp->id()) + ";\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// This fake mtask depends on all the real ones. We use it to block
|
||||||
|
// eval() until all mtasks are done.
|
||||||
|
//
|
||||||
|
// In the future we might allow _eval() to return before the graph is
|
||||||
|
// fully done executing, for "half wave" scheduling. For now we wait
|
||||||
|
// for all mtasks though.
|
||||||
|
puts("VlMTaskVertex __Vm_mt_final;\n");
|
||||||
|
puts("VlThreadPool* __Vm_threadPoolp;\n");
|
||||||
|
|
||||||
|
if (v3Global.opt.profThreads()) {
|
||||||
|
// rdtsc() at current cycle start
|
||||||
|
puts("vluint64_t __Vm_profile_cycle_start;\n");
|
||||||
|
// Time we finished analysis
|
||||||
|
puts("vluint64_t __Vm_profile_time_finished;\n");
|
||||||
|
// Track our position in the cache warmup and actual profile window
|
||||||
|
puts("vluint32_t __Vm_profile_window_ct;\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
puts("bool __Vm_even_cycle;\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
void EmitCImp::emitInt(AstNodeModule* modp) {
|
void EmitCImp::emitInt(AstNodeModule* modp) {
|
||||||
|
|
@ -2000,6 +2394,9 @@ void EmitCImp::emitInt(AstNodeModule* modp) {
|
||||||
} else {
|
} else {
|
||||||
puts("#include \"verilated.h\"\n");
|
puts("#include \"verilated.h\"\n");
|
||||||
}
|
}
|
||||||
|
if (v3Global.opt.mtasks()) {
|
||||||
|
puts("#include \"verilated_threads.h\"\n");
|
||||||
|
}
|
||||||
if (v3Global.opt.savable()) {
|
if (v3Global.opt.savable()) {
|
||||||
puts("#include \"verilated_save.h\"\n");
|
puts("#include \"verilated_save.h\"\n");
|
||||||
}
|
}
|
||||||
|
|
@ -2084,6 +2481,9 @@ void EmitCImp::emitInt(AstNodeModule* modp) {
|
||||||
puts("bool __Vm_inhibitSim; ///< Set true to disable evaluation of module\n");
|
puts("bool __Vm_inhibitSim; ///< Set true to disable evaluation of module\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (modp->isTop() && v3Global.opt.mtasks()) {
|
||||||
|
emitMTaskState();
|
||||||
|
}
|
||||||
emitCoverageDecl(modp); // may flip public/private
|
emitCoverageDecl(modp); // may flip public/private
|
||||||
|
|
||||||
puts("\n// PARAMETERS\n");
|
puts("\n// PARAMETERS\n");
|
||||||
|
|
@ -2291,6 +2691,24 @@ void EmitCImp::main(AstNodeModule* modp, bool slow, bool fast) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (fast && modp->isTop() && v3Global.opt.mtasks()) {
|
||||||
|
// Make a final pass and emit function definitions for the mtasks
|
||||||
|
// in the ExecGraph
|
||||||
|
AstExecGraph* execGraphp = v3Global.rootp()->execGraphp();
|
||||||
|
const V3Graph* depGraphp = execGraphp->depGraphp();
|
||||||
|
for (const V3GraphVertex* vxp = depGraphp->verticesBeginp();
|
||||||
|
vxp; vxp = vxp->verticesNextp()) {
|
||||||
|
const ExecMTask* mtaskp = dynamic_cast<const ExecMTask*>(vxp);
|
||||||
|
if (mtaskp->threadRoot()) {
|
||||||
|
maybeSplit(modp);
|
||||||
|
// Only define one function for all the mtasks packed on
|
||||||
|
// a given thread. We'll name this function after the
|
||||||
|
// root mtask though it contains multiple mtasks' worth
|
||||||
|
// of logic.
|
||||||
|
iterate(mtaskp->bodyp());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
delete m_ofp; m_ofp=NULL;
|
delete m_ofp; m_ofp=NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -94,6 +94,9 @@ public:
|
||||||
putMakeClassEntry(of, "verilated_vcd_sc.cpp");
|
putMakeClassEntry(of, "verilated_vcd_sc.cpp");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (v3Global.opt.mtasks()) {
|
||||||
|
putMakeClassEntry(of, "verilated_threads.cpp");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (support==2 && slow) {
|
else if (support==2 && slow) {
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -131,7 +131,7 @@ public:
|
||||||
"ALWCOMBORDER", "ASSIGNDLY", "ASSIGNIN",
|
"ALWCOMBORDER", "ASSIGNDLY", "ASSIGNIN",
|
||||||
"BLKANDNBLK", "BLKLOOPINIT", "BLKSEQ", "BSSPACE",
|
"BLKANDNBLK", "BLKLOOPINIT", "BLKSEQ", "BSSPACE",
|
||||||
"CASEINCOMPLETE", "CASEOVERLAP", "CASEWITHX", "CASEX", "CDCRSTLOGIC", "CLKDATA",
|
"CASEINCOMPLETE", "CASEOVERLAP", "CASEWITHX", "CASEX", "CDCRSTLOGIC", "CLKDATA",
|
||||||
"CMPCONST", "COLONPLUS", "COMBDLY", "DEFPARAM", "DECLFILENAME",
|
"CMPCONST", "COLONPLUS", "COMBDLY", "DEFPARAM", "DECLFILENAME",
|
||||||
"ENDLABEL", "GENCLK",
|
"ENDLABEL", "GENCLK",
|
||||||
"IFDEPTH", "IMPERFECTSCH", "IMPLICIT", "IMPURE",
|
"IFDEPTH", "IMPERFECTSCH", "IMPLICIT", "IMPURE",
|
||||||
"INCABSPATH", "INFINITELOOP", "INITIALDLY",
|
"INCABSPATH", "INFINITELOOP", "INITIALDLY",
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,8 @@
|
||||||
#include VL_INCLUDE_UNORDERED_MAP
|
#include VL_INCLUDE_UNORDERED_MAP
|
||||||
|
|
||||||
#include "V3Global.h"
|
#include "V3Global.h"
|
||||||
|
#include "V3PartitionGraph.h"
|
||||||
|
#include "V3GraphPathChecker.h"
|
||||||
#include "V3LifePost.h"
|
#include "V3LifePost.h"
|
||||||
#include "V3Stats.h"
|
#include "V3Stats.h"
|
||||||
#include "V3Ast.h"
|
#include "V3Ast.h"
|
||||||
|
|
@ -78,6 +80,11 @@ private:
|
||||||
iterate(nodep->funcp());
|
iterate(nodep->funcp());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
virtual void visit(AstExecGraph* nodep) {
    // LifePostElimVisitor performs only a simple substitution, so the
    // order in which the MTask bodies are visited does not matter;
    // just walk every child of the exec graph.
    iterateChildren(nodep);
}
|
||||||
virtual void visit(AstCFunc* nodep) {
|
virtual void visit(AstCFunc* nodep) {
|
||||||
if (!m_tracingCall && !nodep->entryPoint()) return;
|
if (!m_tracingCall && !nodep->entryPoint()) return;
|
||||||
m_tracingCall = false;
|
m_tracingCall = false;
|
||||||
|
|
@ -101,11 +108,17 @@ public:
|
||||||
// and a sequence number within the mtask:
|
// and a sequence number within the mtask:
|
||||||
|
|
||||||
struct LifeLocation {
|
struct LifeLocation {
|
||||||
|
const ExecMTask* mtaskp;
|
||||||
uint32_t sequence;
|
uint32_t sequence;
|
||||||
public:
|
public:
|
||||||
LifeLocation() : sequence(0) {}
|
LifeLocation() : mtaskp(NULL), sequence(0) {}
|
||||||
LifeLocation(uint32_t sequence_) : sequence(sequence_) {}
|
LifeLocation(const ExecMTask* mtaskp_, uint32_t sequence_)
|
||||||
|
: mtaskp(mtaskp_), sequence(sequence_) {}
|
||||||
bool operator< (const LifeLocation& b) const {
|
bool operator< (const LifeLocation& b) const {
|
||||||
|
unsigned a_id = mtaskp ? mtaskp->id() : 0;
|
||||||
|
unsigned b_id = b.mtaskp ? b.mtaskp->id() : 0;
|
||||||
|
if (a_id < b_id) { return true; }
|
||||||
|
if (b_id < a_id) { return false; }
|
||||||
return sequence < b.sequence;
|
return sequence < b.sequence;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
@ -130,6 +143,9 @@ private:
|
||||||
|
|
||||||
// STATE
|
// STATE
|
||||||
uint32_t m_sequence; // Sequence number of assigns/varrefs,
|
uint32_t m_sequence; // Sequence number of assigns/varrefs,
|
||||||
|
// // local to the current MTask.
|
||||||
|
const ExecMTask* m_execMTaskp; // Current ExecMTask being processed,
|
||||||
|
// // or NULL for serial code.
|
||||||
V3Double0 m_statAssnDel; // Statistic tracking
|
V3Double0 m_statAssnDel; // Statistic tracking
|
||||||
bool m_tracingCall; // Currently tracing a CCall to a CFunc
|
bool m_tracingCall; // Currently tracing a CCall to a CFunc
|
||||||
|
|
||||||
|
|
@ -143,11 +159,15 @@ private:
|
||||||
typedef vl_unordered_map<const AstVarScope*, LifePostLocation> PostLocMap;
|
typedef vl_unordered_map<const AstVarScope*, LifePostLocation> PostLocMap;
|
||||||
PostLocMap m_assignposts; // AssignPost dly var locations
|
PostLocMap m_assignposts; // AssignPost dly var locations
|
||||||
|
|
||||||
|
const V3Graph* m_mtasksGraphp; // Mtask tracking graph
|
||||||
|
vl_unique_ptr<GraphPathChecker> m_checker;
|
||||||
|
|
||||||
// METHODS
|
// METHODS
|
||||||
VL_DEBUG_FUNC; // Declare debug()
|
VL_DEBUG_FUNC; // Declare debug()
|
||||||
|
|
||||||
static bool before(const LifeLocation& a, const LifeLocation& b) {
|
bool before(const LifeLocation& a, const LifeLocation& b) {
|
||||||
return a.sequence < b.sequence;
|
if (a.mtaskp == b.mtaskp) return a.sequence < b.sequence;
|
||||||
|
return m_checker->pathExistsFrom(a.mtaskp, b.mtaskp);
|
||||||
}
|
}
|
||||||
bool outsideCriticalArea(LifeLocation loc,
|
bool outsideCriticalArea(LifeLocation loc,
|
||||||
const std::set<LifeLocation>& dlyVarAssigns,
|
const std::set<LifeLocation>& dlyVarAssigns,
|
||||||
|
|
@ -159,6 +179,13 @@ private:
|
||||||
// Otherwise, loc could fall in the "critical" area where the
|
// Otherwise, loc could fall in the "critical" area where the
|
||||||
// substitution affects the result of the operation at loc, so
|
// substitution affects the result of the operation at loc, so
|
||||||
// return false.
|
// return false.
|
||||||
|
if (!loc.mtaskp && assignPostLoc.mtaskp) {
|
||||||
|
// This is threaded mode; 'loc' is something that happens at
|
||||||
|
// initial/settle time, or perhaps in _eval() but outside of
|
||||||
|
// the mtask graph.
|
||||||
|
// In either case, it's not in the critical area.
|
||||||
|
return true;
|
||||||
|
}
|
||||||
if (before(assignPostLoc, loc)) return true;
|
if (before(assignPostLoc, loc)) return true;
|
||||||
for (std::set<LifeLocation>::iterator it = dlyVarAssigns.begin();
|
for (std::set<LifeLocation>::iterator it = dlyVarAssigns.begin();
|
||||||
it != dlyVarAssigns.end(); ++it) {
|
it != dlyVarAssigns.end(); ++it) {
|
||||||
|
|
@ -239,6 +266,17 @@ private:
|
||||||
// within the mtask) where each varscope is read, and written.
|
// within the mtask) where each varscope is read, and written.
|
||||||
iterateChildren(nodep);
|
iterateChildren(nodep);
|
||||||
|
|
||||||
|
if (v3Global.opt.mtasks()) {
|
||||||
|
if (!m_mtasksGraphp) {
|
||||||
|
nodep->v3fatalSrc("Should have initted m_mtasksGraphp by now");
|
||||||
|
}
|
||||||
|
m_checker.reset(new GraphPathChecker(m_mtasksGraphp));
|
||||||
|
} else {
|
||||||
|
if (m_mtasksGraphp) {
|
||||||
|
nodep->v3fatalSrc("Did not expect any m_mtasksGraphp in serial mode");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Find all assignposts. Determine which ones can be
|
// Find all assignposts. Determine which ones can be
|
||||||
// eliminated. Remove those, and mark their dly vars' user4 field
|
// eliminated. Remove those, and mark their dly vars' user4 field
|
||||||
// to indicate we should replace these dly vars with their original
|
// to indicate we should replace these dly vars with their original
|
||||||
|
|
@ -252,7 +290,8 @@ private:
|
||||||
// Consumption/generation of a variable,
|
// Consumption/generation of a variable,
|
||||||
AstVarScope* vscp = nodep->varScopep();
|
AstVarScope* vscp = nodep->varScopep();
|
||||||
if (!vscp) nodep->v3fatalSrc("Scope not assigned");
|
if (!vscp) nodep->v3fatalSrc("Scope not assigned");
|
||||||
LifeLocation loc(++m_sequence);
|
|
||||||
|
LifeLocation loc(m_execMTaskp, ++m_sequence);
|
||||||
if (nodep->lvalue()) {
|
if (nodep->lvalue()) {
|
||||||
m_writes[vscp].insert(loc);
|
m_writes[vscp].insert(loc);
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -275,7 +314,7 @@ private:
|
||||||
if (m_assignposts.find(dlyVarp) != m_assignposts.end()) {
|
if (m_assignposts.find(dlyVarp) != m_assignposts.end()) {
|
||||||
nodep->v3fatalSrc("LifePostLocation attempted duplicate dlyvar map addition");
|
nodep->v3fatalSrc("LifePostLocation attempted duplicate dlyvar map addition");
|
||||||
}
|
}
|
||||||
LifeLocation loc(++m_sequence);
|
LifeLocation loc(m_execMTaskp, ++m_sequence);
|
||||||
m_assignposts[dlyVarp] = LifePostLocation(loc, nodep);
|
m_assignposts[dlyVarp] = LifePostLocation(loc, nodep);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -291,6 +330,18 @@ private:
|
||||||
iterate(nodep->funcp());
|
iterate(nodep->funcp());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
virtual void visit(AstExecGraph* nodep) {
    // Treat the ExecGraph like a call to each mtask body.
    // Records the dependency graph in m_mtasksGraphp, then visits each
    // mtask body with m_execMTaskp set to that mtask and m_sequence
    // restarted at zero.
    m_mtasksGraphp = nodep->depGraphp();
    for (V3GraphVertex* mtaskVxp = m_mtasksGraphp->verticesBeginp();
         mtaskVxp; mtaskVxp = mtaskVxp->verticesNextp()) {
        ExecMTask* mtaskp = dynamic_cast<ExecMTask*>(mtaskVxp);
        // dynamic_cast yields NULL for a vertex that is not an ExecMTask;
        // report an internal error rather than dereferencing NULL.
        if (!mtaskp) nodep->v3fatalSrc("Exec graph vertex is not an ExecMTask");
        m_execMTaskp = mtaskp;
        m_sequence = 0;
        iterate(mtaskp->bodyp());
    }
    // Back to "not inside any mtask" for whatever is visited next.
    m_execMTaskp = NULL;
}
|
||||||
virtual void visit(AstCFunc* nodep) {
|
virtual void visit(AstCFunc* nodep) {
|
||||||
if (!m_tracingCall && !nodep->entryPoint()) return;
|
if (!m_tracingCall && !nodep->entryPoint()) return;
|
||||||
m_tracingCall = false;
|
m_tracingCall = false;
|
||||||
|
|
@ -305,7 +356,9 @@ public:
|
||||||
// CONSTRUCTORS
|
// CONSTRUCTORS
|
||||||
explicit LifePostDlyVisitor(AstNetlist* nodep)
    : m_sequence(0), m_execMTaskp(NULL), m_tracingCall(false), m_mtasksGraphp(NULL) {
    // Start with no current mtask and no mtask graph recorded, then
    // run this visitor over the given netlist.
    iterate(nodep);
}
|
||||||
virtual ~LifePostDlyVisitor() {
|
virtual ~LifePostDlyVisitor() {
|
||||||
|
|
|
||||||
|
|
@ -661,6 +661,9 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
|
||||||
else if ( !strcmp (sw, "-debug-abort") ) { abort(); } // Undocumented, see also --debug-sigsegv
|
else if ( !strcmp (sw, "-debug-abort") ) { abort(); } // Undocumented, see also --debug-sigsegv
|
||||||
else if ( onoff (sw, "-debug-check", flag/*ref*/) ){ m_debugCheck = flag; }
|
else if ( onoff (sw, "-debug-check", flag/*ref*/) ){ m_debugCheck = flag; }
|
||||||
else if ( onoff (sw, "-debug-leak", flag/*ref*/) ){ m_debugLeak = flag; }
|
else if ( onoff (sw, "-debug-leak", flag/*ref*/) ){ m_debugLeak = flag; }
|
||||||
|
else if ( onoff (sw, "-debug-nondeterminism", flag/*ref*/) ){ m_debugNondeterminism = flag; }
|
||||||
|
else if ( onoff (sw, "-debug-partition", flag/*ref*/) ){ m_debugPartition = flag; } // Undocumented
|
||||||
|
else if ( onoff (sw, "-debug-self-test", flag/*ref*/) ){ m_debugSelfTest = flag; } // Undocumented
|
||||||
else if ( !strcmp (sw, "-debug-sigsegv") ) { throwSigsegv(); } // Undocumented, see also --debug-abort
|
else if ( !strcmp (sw, "-debug-sigsegv") ) { throwSigsegv(); } // Undocumented, see also --debug-abort
|
||||||
else if ( !strcmp (sw, "-debug-fatalsrc") ) { v3fatalSrc("--debug-fatal-src"); } // Undocumented, see also --debug-abort
|
else if ( !strcmp (sw, "-debug-fatalsrc") ) { v3fatalSrc("--debug-fatal-src"); } // Undocumented, see also --debug-abort
|
||||||
else if ( onoff (sw, "-decoration", flag/*ref*/) ) { m_decoration = flag; }
|
else if ( onoff (sw, "-decoration", flag/*ref*/) ) { m_decoration = flag; }
|
||||||
|
|
@ -678,6 +681,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
|
||||||
else if ( !strcmp (sw, "-private") ) { m_public = false; }
|
else if ( !strcmp (sw, "-private") ) { m_public = false; }
|
||||||
else if ( onoff (sw, "-prof-cfuncs", flag/*ref*/) ) { m_profCFuncs = flag; }
|
else if ( onoff (sw, "-prof-cfuncs", flag/*ref*/) ) { m_profCFuncs = flag; }
|
||||||
else if ( onoff (sw, "-profile-cfuncs", flag/*ref*/) ) { m_profCFuncs = flag; } // Undocumented, for backward compat
|
else if ( onoff (sw, "-profile-cfuncs", flag/*ref*/) ) { m_profCFuncs = flag; } // Undocumented, for backward compat
|
||||||
|
else if ( onoff (sw, "-prof-threads", flag/*ref*/) ) { m_profThreads = flag; }
|
||||||
else if ( onoff (sw, "-public", flag/*ref*/) ) { m_public = flag; }
|
else if ( onoff (sw, "-public", flag/*ref*/) ) { m_public = flag; }
|
||||||
else if ( !strncmp(sw, "-pvalue+", strlen("-pvalue+"))) { addParameter(string(sw+strlen("-pvalue+")), false); }
|
else if ( !strncmp(sw, "-pvalue+", strlen("-pvalue+"))) { addParameter(string(sw+strlen("-pvalue+")), false); }
|
||||||
else if ( onoff (sw, "-relative-cfuncs", flag/*ref*/) ) { m_relativeCFuncs = flag; }
|
else if ( onoff (sw, "-relative-cfuncs", flag/*ref*/) ) { m_relativeCFuncs = flag; }
|
||||||
|
|
@ -689,6 +693,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
|
||||||
else if ( onoff (sw, "-stats", flag/*ref*/) ) { m_stats = flag; }
|
else if ( onoff (sw, "-stats", flag/*ref*/) ) { m_stats = flag; }
|
||||||
else if ( onoff (sw, "-stats-vars", flag/*ref*/) ) { m_statsVars = flag; m_stats |= flag; }
|
else if ( onoff (sw, "-stats-vars", flag/*ref*/) ) { m_statsVars = flag; m_stats |= flag; }
|
||||||
else if ( !strcmp (sw, "-sv") ) { m_defaultLanguage = V3LangCode::L1800_2005; }
|
else if ( !strcmp (sw, "-sv") ) { m_defaultLanguage = V3LangCode::L1800_2005; }
|
||||||
|
else if ( onoff (sw, "-threads-coarsen", flag/*ref*/)) { m_threadsCoarsen = flag; } // Undocumented, debug
|
||||||
else if ( onoff (sw, "-trace", flag/*ref*/) ) { m_trace = flag; }
|
else if ( onoff (sw, "-trace", flag/*ref*/) ) { m_trace = flag; }
|
||||||
else if ( onoff (sw, "-trace-dups", flag/*ref*/) ) { m_traceDups = flag; }
|
else if ( onoff (sw, "-trace-dups", flag/*ref*/) ) { m_traceDups = flag; }
|
||||||
else if ( onoff (sw, "-trace-params", flag/*ref*/) ) { m_traceParams = flag; }
|
else if ( onoff (sw, "-trace-params", flag/*ref*/) ) { m_traceParams = flag; }
|
||||||
|
|
@ -1013,6 +1018,20 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char
|
||||||
shift; m_threads = atoi(argv[i]);
|
shift; m_threads = atoi(argv[i]);
|
||||||
if (m_threads < 0) fl->v3fatal("--threads must be >= 0: "<<argv[i]);
|
if (m_threads < 0) fl->v3fatal("--threads must be >= 0: "<<argv[i]);
|
||||||
}
|
}
|
||||||
|
else if ( !strcmp (sw, "-threads-dpi") && (i+1)<argc) {
|
||||||
|
shift;
|
||||||
|
if (!strcmp(argv[i], "all")) { m_threadsDpiPure=true; m_threadsDpiUnpure=true; }
|
||||||
|
else if (!strcmp(argv[i], "none")) { m_threadsDpiPure=false; m_threadsDpiUnpure=false; }
|
||||||
|
else if (!strcmp(argv[i], "pure")) { m_threadsDpiPure=true; m_threadsDpiUnpure=false; }
|
||||||
|
else {
|
||||||
|
fl->v3fatal("Unknown setting for --threads-dpi: "<<argv[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if ( !strcmp (sw, "-threads-max-mtasks") && (i+1)<argc ) {
    // Require a following argument before consuming it, the same way the
    // -threads-dpi and -top-module arms do; otherwise argv[i] below
    // would be read past the last real argument.
    shift; m_threadsMaxMTasks = atoi(argv[i]);
    if (m_threadsMaxMTasks < 1)
        fl->v3fatal("--threads-max-mtasks must be >= 1: "<<argv[i]);
}
|
||||||
else if ( !strcmp (sw, "-top-module") && (i+1)<argc ) {
|
else if ( !strcmp (sw, "-top-module") && (i+1)<argc ) {
|
||||||
shift; m_topModule = argv[i];
|
shift; m_topModule = argv[i];
|
||||||
}
|
}
|
||||||
|
|
@ -1223,6 +1242,9 @@ V3Options::V3Options() {
|
||||||
m_coverageUser = false;
|
m_coverageUser = false;
|
||||||
m_debugCheck = false;
|
m_debugCheck = false;
|
||||||
m_debugLeak = true;
|
m_debugLeak = true;
|
||||||
|
m_debugNondeterminism = false;
|
||||||
|
m_debugPartition = false;
|
||||||
|
m_debugSelfTest = false;
|
||||||
m_decoration = true;
|
m_decoration = true;
|
||||||
m_exe = false;
|
m_exe = false;
|
||||||
m_ignc = false;
|
m_ignc = false;
|
||||||
|
|
@ -1237,6 +1259,7 @@ V3Options::V3Options() {
|
||||||
m_pinsScBigUint = false;
|
m_pinsScBigUint = false;
|
||||||
m_pinsUint8 = false;
|
m_pinsUint8 = false;
|
||||||
m_profCFuncs = false;
|
m_profCFuncs = false;
|
||||||
|
m_profThreads = false;
|
||||||
m_preprocOnly = false;
|
m_preprocOnly = false;
|
||||||
m_preprocNoLine = false;
|
m_preprocNoLine = false;
|
||||||
m_public = false;
|
m_public = false;
|
||||||
|
|
@ -1249,6 +1272,10 @@ V3Options::V3Options() {
|
||||||
m_statsVars = false;
|
m_statsVars = false;
|
||||||
m_systemC = false;
|
m_systemC = false;
|
||||||
m_threads = 0;
|
m_threads = 0;
|
||||||
|
m_threadsDpiPure = true;
|
||||||
|
m_threadsDpiUnpure = false;
|
||||||
|
m_threadsCoarsen = true;
|
||||||
|
m_threadsMaxMTasks = 0;
|
||||||
m_trace = false;
|
m_trace = false;
|
||||||
m_traceDups = false;
|
m_traceDups = false;
|
||||||
m_traceParams = true;
|
m_traceParams = true;
|
||||||
|
|
|
||||||
|
|
@ -75,7 +75,10 @@ class V3Options {
|
||||||
bool m_coverageUnderscore;// main switch: --coverage-underscore
|
bool m_coverageUnderscore;// main switch: --coverage-underscore
|
||||||
bool m_coverageUser; // main switch: --coverage-func
|
bool m_coverageUser; // main switch: --coverage-func
|
||||||
bool m_debugCheck; // main switch: --debug-check
|
bool m_debugCheck; // main switch: --debug-check
|
||||||
bool m_debugLeak; // main switch: --debug-leak
|
bool m_debugLeak; // main switch: --debug-leak
|
||||||
|
bool m_debugNondeterminism; // main switch: --debug-nondeterminism
|
||||||
|
bool m_debugPartition; // main switch: --debug-partition
|
||||||
|
bool m_debugSelfTest; // main switch: --debug-self-test
|
||||||
bool m_decoration; // main switch: --decoration
|
bool m_decoration; // main switch: --decoration
|
||||||
bool m_exe; // main switch: --exe
|
bool m_exe; // main switch: --exe
|
||||||
bool m_ignc; // main switch: --ignc
|
bool m_ignc; // main switch: --ignc
|
||||||
|
|
@ -87,6 +90,7 @@ class V3Options {
|
||||||
bool m_pinsScBigUint;// main switch: --pins-sc-biguint
|
bool m_pinsScBigUint;// main switch: --pins-sc-biguint
|
||||||
bool m_pinsUint8; // main switch: --pins-uint8
|
bool m_pinsUint8; // main switch: --pins-uint8
|
||||||
bool m_profCFuncs; // main switch: --prof-cfuncs
|
bool m_profCFuncs; // main switch: --prof-cfuncs
|
||||||
|
bool m_profThreads; // main switch: --prof-threads
|
||||||
bool m_public; // main switch: --public
|
bool m_public; // main switch: --public
|
||||||
bool m_relativeCFuncs; // main switch: --relative-cfuncs
|
bool m_relativeCFuncs; // main switch: --relative-cfuncs
|
||||||
bool m_relativeIncludes; // main switch: --relative-includes
|
bool m_relativeIncludes; // main switch: --relative-includes
|
||||||
|
|
@ -96,6 +100,9 @@ class V3Options {
|
||||||
bool m_skipIdentical;// main switch: --skip-identical
|
bool m_skipIdentical;// main switch: --skip-identical
|
||||||
bool m_stats; // main switch: --stats
|
bool m_stats; // main switch: --stats
|
||||||
bool m_statsVars; // main switch: --stats-vars
|
bool m_statsVars; // main switch: --stats-vars
|
||||||
|
bool m_threadsCoarsen; // main switch: --threads-coarsen
|
||||||
|
bool m_threadsDpiPure; // main switch: --threads-dpi all/pure
|
||||||
|
bool m_threadsDpiUnpure; // main switch: --threads-dpi all
|
||||||
bool m_trace; // main switch: --trace
|
bool m_trace; // main switch: --trace
|
||||||
bool m_traceDups; // main switch: --trace-dups
|
bool m_traceDups; // main switch: --trace-dups
|
||||||
bool m_traceParams; // main switch: --trace-params
|
bool m_traceParams; // main switch: --trace-params
|
||||||
|
|
@ -117,6 +124,7 @@ class V3Options {
|
||||||
int m_outputSplitCTrace;// main switch: --output-split-ctrace
|
int m_outputSplitCTrace;// main switch: --output-split-ctrace
|
||||||
int m_pinsBv; // main switch: --pins-bv
|
int m_pinsBv; // main switch: --pins-bv
|
||||||
int m_threads; // main switch: --threads (0 == --no-threads)
|
int m_threads; // main switch: --threads (0 == --no-threads)
|
||||||
|
int m_threadsMaxMTasks; // main switch: --threads-max-mtasks
|
||||||
int m_traceDepth; // main switch: --trace-depth
|
int m_traceDepth; // main switch: --trace-depth
|
||||||
int m_traceMaxArray;// main switch: --trace-max-array
|
int m_traceMaxArray;// main switch: --trace-max-array
|
||||||
int m_traceMaxWidth;// main switch: --trace-max-width
|
int m_traceMaxWidth;// main switch: --trace-max-width
|
||||||
|
|
@ -232,8 +240,14 @@ class V3Options {
|
||||||
bool coverageUser() const { return m_coverageUser; }
|
bool coverageUser() const { return m_coverageUser; }
|
||||||
bool debugCheck() const { return m_debugCheck; }
|
bool debugCheck() const { return m_debugCheck; }
|
||||||
bool debugLeak() const { return m_debugLeak; }
|
bool debugLeak() const { return m_debugLeak; }
|
||||||
|
bool debugNondeterminism() const { return m_debugNondeterminism; }
|
||||||
|
bool debugPartition() const { return m_debugPartition; }
|
||||||
|
bool debugSelfTest() const { return m_debugSelfTest; }
|
||||||
bool decoration() const { return m_decoration; }
|
bool decoration() const { return m_decoration; }
|
||||||
bool exe() const { return m_exe; }
|
bool exe() const { return m_exe; }
|
||||||
|
bool threadsDpiPure() const { return m_threadsDpiPure; }
|
||||||
|
bool threadsDpiUnpure() const { return m_threadsDpiUnpure; }
|
||||||
|
bool threadsCoarsen() const { return m_threadsCoarsen; }
|
||||||
bool trace() const { return m_trace; }
|
bool trace() const { return m_trace; }
|
||||||
bool traceDups() const { return m_traceDups; }
|
bool traceDups() const { return m_traceDups; }
|
||||||
bool traceParams() const { return m_traceParams; }
|
bool traceParams() const { return m_traceParams; }
|
||||||
|
|
@ -246,6 +260,7 @@ class V3Options {
|
||||||
bool pinsScBigUint() const { return m_pinsScBigUint; }
|
bool pinsScBigUint() const { return m_pinsScBigUint; }
|
||||||
bool pinsUint8() const { return m_pinsUint8; }
|
bool pinsUint8() const { return m_pinsUint8; }
|
||||||
bool profCFuncs() const { return m_profCFuncs; }
|
bool profCFuncs() const { return m_profCFuncs; }
|
||||||
|
bool profThreads() const { return m_profThreads; }
|
||||||
bool allPublic() const { return m_public; }
|
bool allPublic() const { return m_public; }
|
||||||
bool lintOnly() const { return m_lintOnly; }
|
bool lintOnly() const { return m_lintOnly; }
|
||||||
bool ignc() const { return m_ignc; }
|
bool ignc() const { return m_ignc; }
|
||||||
|
|
@ -267,6 +282,7 @@ class V3Options {
|
||||||
int outputSplitCTrace() const { return m_outputSplitCTrace; }
|
int outputSplitCTrace() const { return m_outputSplitCTrace; }
|
||||||
int pinsBv() const { return m_pinsBv; }
|
int pinsBv() const { return m_pinsBv; }
|
||||||
int threads() const { return m_threads; }
|
int threads() const { return m_threads; }
|
||||||
|
int threadsMaxMTasks() const { return m_threadsMaxMTasks; }
|
||||||
bool mtasks() const { return (m_threads > 1); }
|
bool mtasks() const { return (m_threads > 1); }
|
||||||
int traceDepth() const { return m_traceDepth; }
|
int traceDepth() const { return m_traceDepth; }
|
||||||
int traceMaxArray() const { return m_traceMaxArray; }
|
int traceMaxArray() const { return m_traceMaxArray; }
|
||||||
|
|
|
||||||
267
src/V3Order.cpp
267
src/V3Order.cpp
|
|
@ -89,19 +89,22 @@
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#include "V3Global.h"
|
|
||||||
#include "V3File.h"
|
|
||||||
#include "V3Ast.h"
|
#include "V3Ast.h"
|
||||||
|
#include "V3Const.h"
|
||||||
|
#include "V3EmitCBase.h"
|
||||||
|
#include "V3EmitV.h"
|
||||||
|
#include "V3File.h"
|
||||||
|
#include "V3Global.h"
|
||||||
#include "V3Graph.h"
|
#include "V3Graph.h"
|
||||||
|
#include "V3GraphStream.h"
|
||||||
#include "V3List.h"
|
#include "V3List.h"
|
||||||
|
#include "V3Partition.h"
|
||||||
|
#include "V3PartitionGraph.h"
|
||||||
#include "V3SenTree.h"
|
#include "V3SenTree.h"
|
||||||
#include "V3Stats.h"
|
#include "V3Stats.h"
|
||||||
#include "V3EmitCBase.h"
|
|
||||||
#include "V3Const.h"
|
|
||||||
|
|
||||||
#include "V3Order.h"
|
#include "V3Order.h"
|
||||||
#include "V3OrderGraph.h"
|
#include "V3OrderGraph.h"
|
||||||
#include "V3EmitV.h"
|
|
||||||
|
|
||||||
#include VL_INCLUDE_UNORDERED_MAP
|
#include VL_INCLUDE_UNORDERED_MAP
|
||||||
#include VL_INCLUDE_UNORDERED_SET
|
#include VL_INCLUDE_UNORDERED_SET
|
||||||
|
|
@ -423,10 +426,15 @@ class ProcessMoveBuildGraph {
|
||||||
// OrderVisitor. It produces a slightly coarsened graph to drive the
|
// OrderVisitor. It produces a slightly coarsened graph to drive the
|
||||||
// code scheduling.
|
// code scheduling.
|
||||||
//
|
//
|
||||||
// * The new graph contains nodes of type OrderMoveVertex.
|
// * For the serial code scheduler, the new graph contains
|
||||||
|
// nodes of type OrderMoveVertex.
|
||||||
|
//
|
||||||
|
// * For the threaded code scheduler, the new graph contains
|
||||||
|
// nodes of type MTaskMoveVertex.
|
||||||
//
|
//
|
||||||
// * The difference in output type is abstracted away by the
|
// * The difference in output type is abstracted away by the
|
||||||
// 'T_MoveVertex' template parameter.
|
// 'T_MoveVertex' template parameter; ProcessMoveBuildGraph otherwise
|
||||||
|
// works the same way for both cases.
|
||||||
|
|
||||||
// TYPES
|
// TYPES
|
||||||
typedef std::pair<const V3GraphVertex*, const AstSenTree*> VxDomPair;
|
typedef std::pair<const V3GraphVertex*, const AstSenTree*> VxDomPair;
|
||||||
|
|
@ -563,7 +571,7 @@ private:
|
||||||
};
|
};
|
||||||
|
|
||||||
//######################################################################
|
//######################################################################
|
||||||
// OrderMoveVertexMaker
|
// OrderMoveVertexMaker and related
|
||||||
|
|
||||||
class OrderMoveVertexMaker
|
class OrderMoveVertexMaker
|
||||||
: public ProcessMoveBuildGraph<OrderMoveVertex>::MoveVertexMaker {
|
: public ProcessMoveBuildGraph<OrderMoveVertex>::MoveVertexMaker {
|
||||||
|
|
@ -595,6 +603,64 @@ private:
|
||||||
VL_UNCOPYABLE(OrderMoveVertexMaker);
|
VL_UNCOPYABLE(OrderMoveVertexMaker);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class OrderMTaskMoveVertexMaker
|
||||||
|
: public ProcessMoveBuildGraph<MTaskMoveVertex>::MoveVertexMaker {
|
||||||
|
V3Graph* m_pomGraphp;
|
||||||
|
public:
|
||||||
|
explicit OrderMTaskMoveVertexMaker(V3Graph* pomGraphp)
|
||||||
|
: m_pomGraphp(pomGraphp) {}
|
||||||
|
MTaskMoveVertex* makeVertexp(OrderLogicVertex* lvertexp,
|
||||||
|
const OrderEitherVertex* varVertexp,
|
||||||
|
const AstScope* scopep,
|
||||||
|
const AstSenTree* domainp) {
|
||||||
|
// Exclude initial/settle logic from the mtasks graph.
|
||||||
|
// We'll output time-zero logic separately.
|
||||||
|
if (domainp->hasInitial() || domainp->hasSettle()) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return new MTaskMoveVertex(m_pomGraphp, lvertexp, varVertexp, scopep, domainp);
|
||||||
|
}
|
||||||
|
void freeVertexp(MTaskMoveVertex* freeMep) {
|
||||||
|
freeMep->unlinkDelete(m_pomGraphp);
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
VL_UNCOPYABLE(OrderMTaskMoveVertexMaker);
|
||||||
|
};
|
||||||
|
|
||||||
|
class OrderVerticesByDomainThenScope {
|
||||||
|
PartPtrIdMap m_ids;
|
||||||
|
public:
|
||||||
|
virtual bool operator()(const V3GraphVertex* lhsp,
|
||||||
|
const V3GraphVertex* rhsp) const {
|
||||||
|
const MTaskMoveVertex* l_vxp = dynamic_cast<const MTaskMoveVertex*>(lhsp);
|
||||||
|
const MTaskMoveVertex* r_vxp = dynamic_cast<const MTaskMoveVertex*>(rhsp);
|
||||||
|
vluint64_t l_id = m_ids.findId(l_vxp->domainp());
|
||||||
|
vluint64_t r_id = m_ids.findId(r_vxp->domainp());
|
||||||
|
if (l_id < r_id) return true;
|
||||||
|
if (l_id > r_id) return false;
|
||||||
|
l_id = m_ids.findId(l_vxp->scopep());
|
||||||
|
r_id = m_ids.findId(r_vxp->scopep());
|
||||||
|
return l_id < r_id;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class MTaskVxIdLessThan {
|
||||||
|
public:
|
||||||
|
MTaskVxIdLessThan() {}
|
||||||
|
virtual ~MTaskVxIdLessThan() {}
|
||||||
|
|
||||||
|
// Sort vertex's, which must be AbstractMTask's, into a deterministic
|
||||||
|
// order by comparing their serial IDs.
|
||||||
|
virtual bool operator()(const V3GraphVertex* lhsp,
|
||||||
|
const V3GraphVertex* rhsp) const {
|
||||||
|
const AbstractMTask* lmtaskp =
|
||||||
|
dynamic_cast<const AbstractLogicMTask*>(lhsp);
|
||||||
|
const AbstractMTask* rmtaskp =
|
||||||
|
dynamic_cast<const AbstractLogicMTask*>(rhsp);
|
||||||
|
return lmtaskp->id() < rmtaskp->id();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
//######################################################################
|
//######################################################################
|
||||||
// Order class functions
|
// Order class functions
|
||||||
|
|
||||||
|
|
@ -701,6 +767,7 @@ private:
|
||||||
void processDomainsIterate(OrderEitherVertex* vertexp);
|
void processDomainsIterate(OrderEitherVertex* vertexp);
|
||||||
void processEdgeReport();
|
void processEdgeReport();
|
||||||
|
|
||||||
|
// processMove* routines schedule serial execution
|
||||||
void processMove();
|
void processMove();
|
||||||
void processMoveClear();
|
void processMoveClear();
|
||||||
void processMoveBuildGraph();
|
void processMoveBuildGraph();
|
||||||
|
|
@ -711,6 +778,18 @@ private:
|
||||||
AstActive* processMoveOneLogic(const OrderLogicVertex* lvertexp,
|
AstActive* processMoveOneLogic(const OrderLogicVertex* lvertexp,
|
||||||
AstCFunc*& newFuncpr, int& newStmtsr);
|
AstCFunc*& newFuncpr, int& newStmtsr);
|
||||||
|
|
||||||
|
// processMTask* routines schedule threaded execution
|
||||||
|
struct MTaskState {
|
||||||
|
typedef std::list<const OrderLogicVertex*> Logics;
|
||||||
|
AstMTaskBody* m_mtaskBodyp;
|
||||||
|
Logics m_logics;
|
||||||
|
ExecMTask* m_execMTaskp;
|
||||||
|
MTaskState() : m_mtaskBodyp(NULL), m_execMTaskp(NULL) {}
|
||||||
|
};
|
||||||
|
void processMTasks();
|
||||||
|
typedef enum {LOGIC_INITIAL, LOGIC_SETTLE} InitialLogicE;
|
||||||
|
void processMTasksInitial(InitialLogicE logic_type);
|
||||||
|
|
||||||
string cfuncName(AstNodeModule* modp, AstSenTree* domainp, AstScope* scopep, AstNode* forWhatp) {
|
string cfuncName(AstNodeModule* modp, AstSenTree* domainp, AstScope* scopep, AstNode* forWhatp) {
|
||||||
modp->user3Inc();
|
modp->user3Inc();
|
||||||
int funcnum = modp->user3();
|
int funcnum = modp->user3();
|
||||||
|
|
@ -1726,6 +1805,173 @@ AstActive* OrderVisitor::processMoveOneLogic(const OrderLogicVertex* lvertexp,
|
||||||
return activep;
|
return activep;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emit the initial or settle logic found in m_graph, depending on
// logic_type.  This logic is not part of the mtask partition, so it is not
// eligible for parallelism; the resulting AstActive's are added directly
// under m_scopetopp.
void OrderVisitor::processMTasksInitial(InitialLogicE logic_type) {
    int stmtCount = 0;
    AstCFunc* curFuncp = NULL;
    AstScope* prevScopep = NULL;
    V3GraphVertex* vxp = m_graph.verticesBeginp();
    for (; vxp; vxp = vxp->verticesNextp()) {
        OrderLogicVertex* const logicp = dynamic_cast<OrderLogicVertex*>(vxp);
        if (!logicp) continue;  // Only logic vertices are emitted

        // Keep only logic whose domain matches the requested kind
        bool selected = true;
        if (logic_type == LOGIC_INITIAL) {
            selected = logicp->domainp()->hasInitial();
        } else if (logic_type == LOGIC_SETTLE) {
            selected = logicp->domainp()->hasSettle();
        }
        if (!selected) continue;

        if (logicp->scopep() != prevScopep) {
            // Scope changed: force a new cfunc so no cfunc crosses scopes
            curFuncp = NULL;
            prevScopep = logicp->scopep();
        }
        AstActive* const activep
            = processMoveOneLogic(logicp, curFuncp/*ref*/, stmtCount/*ref*/);
        if (activep) m_scopetopp->addActivep(activep);
    }
}
|
||||||
|
|
||||||
|
void OrderVisitor::processMTasks() {
|
||||||
|
// For nondeterminism debug:
|
||||||
|
V3Partition::hashGraphDebug(&m_graph, "V3Order's m_graph");
|
||||||
|
|
||||||
|
processMTasksInitial(LOGIC_INITIAL);
|
||||||
|
processMTasksInitial(LOGIC_SETTLE);
|
||||||
|
|
||||||
|
// We already produced a graph of every var, input, logic, and settle
|
||||||
|
// block and all dependencies; this is 'm_graph'.
|
||||||
|
//
|
||||||
|
// Now, starting from m_graph, make a slightly-coarsened graph representing
|
||||||
|
// only logic, and discarding edges we know we can ignore.
|
||||||
|
// This is quite similar to the 'm_pomGraph' of the serial code gen:
|
||||||
|
V3Graph logicGraph;
|
||||||
|
OrderMTaskMoveVertexMaker create_mtask_vertex(&logicGraph);
|
||||||
|
ProcessMoveBuildGraph<MTaskMoveVertex> mtask_pmbg(
|
||||||
|
&m_graph, &logicGraph, &create_mtask_vertex);
|
||||||
|
mtask_pmbg.build();
|
||||||
|
|
||||||
|
// Needed? We do this for m_pomGraph in serial mode, so do it here too:
|
||||||
|
logicGraph.removeRedundantEdges(&V3GraphEdge::followAlwaysTrue);
|
||||||
|
|
||||||
|
// Partition logicGraph into LogicMTask's. The partitioner will annotate
|
||||||
|
// each vertex in logicGraph with a 'color' which is really an mtask ID
|
||||||
|
// in this context.
|
||||||
|
V3Partition partitioner(&logicGraph);
|
||||||
|
V3Graph mtasks;
|
||||||
|
partitioner.go(&mtasks);
|
||||||
|
|
||||||
|
vl_unordered_map<unsigned /*mtask id*/, MTaskState> mtaskStates;
|
||||||
|
|
||||||
|
// Iterate through the entire logicGraph. For each logic node,
|
||||||
|
// attach it to a per-MTask ordered list of logic nodes.
|
||||||
|
// This is the order we'll execute logic nodes within the MTask.
|
||||||
|
//
|
||||||
|
// MTasks may span scopes and domains, so sort by both here:
|
||||||
|
GraphStream<OrderVerticesByDomainThenScope> emit_logic(&logicGraph);
|
||||||
|
const V3GraphVertex* moveVxp;
|
||||||
|
while ((moveVxp = emit_logic.nextp())) {
|
||||||
|
const MTaskMoveVertex* movep =
|
||||||
|
dynamic_cast<const MTaskMoveVertex*>(moveVxp);
|
||||||
|
unsigned mtaskId = movep->color();
|
||||||
|
UASSERT(mtaskId > 0,
|
||||||
|
"Every MTaskMoveVertex should have an mtask assignment >0");
|
||||||
|
if (movep->logicp()) {
|
||||||
|
// Add this logic to the per-mtask order
|
||||||
|
mtaskStates[mtaskId].m_logics.push_back(movep->logicp());
|
||||||
|
|
||||||
|
// Since we happen to be iterating over every logic node,
|
||||||
|
// take this opportunity to annotate each AstVar with the id's
|
||||||
|
// of mtasks that consume it and produce it. We'll use this
|
||||||
|
// information in V3EmitC when we lay out var's in memory.
|
||||||
|
const OrderLogicVertex* logicp = movep->logicp();
|
||||||
|
for (const V3GraphEdge* edgep = logicp->inBeginp();
|
||||||
|
edgep; edgep = edgep->inNextp()) {
|
||||||
|
const OrderVarVertex* pre_varp =
|
||||||
|
dynamic_cast<const OrderVarVertex*>(edgep->fromp());
|
||||||
|
if (!pre_varp) continue;
|
||||||
|
AstVar* varp = pre_varp->varScp()->varp();
|
||||||
|
// varp depends on logicp, so logicp produces varp,
|
||||||
|
// and vice-versa below
|
||||||
|
varp->addProducingMTaskId(mtaskId);
|
||||||
|
}
|
||||||
|
for (const V3GraphEdge* edgep = logicp->outBeginp();
|
||||||
|
edgep; edgep = edgep->outNextp()) {
|
||||||
|
const OrderVarVertex* post_varp
|
||||||
|
= dynamic_cast<const OrderVarVertex*>(edgep->top());
|
||||||
|
if (!post_varp) continue;
|
||||||
|
AstVar* varp = post_varp->varScp()->varp();
|
||||||
|
varp->addConsumingMTaskId(mtaskId);
|
||||||
|
}
|
||||||
|
// TODO? We ignore IO vars here, so those will have empty mtask
|
||||||
|
// signatures. But we could also give those mtask signatures.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the AstExecGraph node which represents the execution
|
||||||
|
// of the MTask graph.
|
||||||
|
FileLine* rootFlp = new FileLine("AstRoot", 0);
|
||||||
|
AstExecGraph* execGraphp = new AstExecGraph(rootFlp);
|
||||||
|
m_scopetopp->addActivep(execGraphp);
|
||||||
|
v3Global.rootp()->execGraphp(execGraphp);
|
||||||
|
|
||||||
|
// Create CFuncs and bodies for each MTask.
|
||||||
|
GraphStream<MTaskVxIdLessThan> emit_mtasks(&mtasks);
|
||||||
|
const V3GraphVertex* mtaskVxp;
|
||||||
|
while ((mtaskVxp = emit_mtasks.nextp())) {
|
||||||
|
const AbstractLogicMTask* mtaskp =
|
||||||
|
dynamic_cast<const AbstractLogicMTask*>(mtaskVxp);
|
||||||
|
|
||||||
|
// Create a body for this mtask
|
||||||
|
AstMTaskBody* bodyp = new AstMTaskBody(rootFlp);
|
||||||
|
MTaskState& state = mtaskStates[mtaskp->id()];
|
||||||
|
state.m_mtaskBodyp = bodyp;
|
||||||
|
|
||||||
|
// Create leaf CFunc's to run this mtask's logic,
|
||||||
|
// and create a set of AstActive's to call those CFuncs.
|
||||||
|
// Add the AstActive's into the AstMTaskBody.
|
||||||
|
const AstSenTree* last_domainp = NULL;
|
||||||
|
AstCFunc* leafCFuncp = NULL;
|
||||||
|
int leafStmts = 0;
|
||||||
|
for (MTaskState::Logics::iterator it = state.m_logics.begin();
|
||||||
|
it != state.m_logics.end(); ++it) {
|
||||||
|
const OrderLogicVertex* logicp = *it;
|
||||||
|
if (logicp->domainp() != last_domainp) {
|
||||||
|
// Start a new leaf function.
|
||||||
|
leafCFuncp = NULL;
|
||||||
|
}
|
||||||
|
last_domainp = logicp->domainp();
|
||||||
|
|
||||||
|
AstActive* newActivep = processMoveOneLogic(logicp, leafCFuncp/*ref*/, leafStmts/*ref*/);
|
||||||
|
if (newActivep) bodyp->addStmtsp(newActivep);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Translate the LogicMTask graph into the corresponding ExecMTask
|
||||||
|
// graph, which will outlive V3Order and persist for the remainder
|
||||||
|
// of verilator's processing.
|
||||||
|
// - The LogicMTask graph points to MTaskMoveVertex's
|
||||||
|
// and OrderLogicVertex's which are ephemeral to V3Order.
|
||||||
|
// - The ExecMTask graph and the AstMTaskBody's produced here
|
||||||
|
// persist until code generation time.
|
||||||
|
state.m_execMTaskp =
|
||||||
|
new ExecMTask(execGraphp->mutableDepGraphp(),
|
||||||
|
bodyp, mtaskp->id());
|
||||||
|
// Cross-link each ExecMTask and MTaskBody
|
||||||
|
// Q: Why even have two objects?
|
||||||
|
// A: One is an AstNode, the other is a GraphVertex,
|
||||||
|
// to combine them would involve multiple inheritance...
|
||||||
|
state.m_mtaskBodyp->execMTaskp(state.m_execMTaskp);
|
||||||
|
for (V3GraphEdge* inp = mtaskp->inBeginp();
|
||||||
|
inp; inp = inp->inNextp()) {
|
||||||
|
const V3GraphVertex* fromVxp = inp->fromp();
|
||||||
|
const AbstractLogicMTask* fromp =
|
||||||
|
dynamic_cast<const AbstractLogicMTask*>(fromVxp);
|
||||||
|
MTaskState& fromState = mtaskStates[fromp->id()];
|
||||||
|
new V3GraphEdge(execGraphp->mutableDepGraphp(),
|
||||||
|
fromState.m_execMTaskp, state.m_execMTaskp, 1);
|
||||||
|
}
|
||||||
|
execGraphp->addMTaskBody(bodyp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//######################################################################
|
//######################################################################
|
||||||
// OrderVisitor - Top processing
|
// OrderVisitor - Top processing
|
||||||
|
|
||||||
|
|
@ -1762,7 +2008,7 @@ void OrderVisitor::process() {
|
||||||
|
|
||||||
if (debug() && v3Global.opt.dumpTree()) processEdgeReport();
|
if (debug() && v3Global.opt.dumpTree()) processEdgeReport();
|
||||||
|
|
||||||
{
|
if (!v3Global.opt.mtasks()) {
|
||||||
UINFO(2," Construct Move Graph...\n");
|
UINFO(2," Construct Move Graph...\n");
|
||||||
processMoveBuildGraph();
|
processMoveBuildGraph();
|
||||||
if (debug()>=4) m_pomGraph.dumpDotFilePrefixed("ordermv_start"); // Different prefix (ordermv) as it's not the same graph
|
if (debug()>=4) m_pomGraph.dumpDotFilePrefixed("ordermv_start"); // Different prefix (ordermv) as it's not the same graph
|
||||||
|
|
@ -1771,6 +2017,9 @@ void OrderVisitor::process() {
|
||||||
|
|
||||||
UINFO(2," Move...\n");
|
UINFO(2," Move...\n");
|
||||||
processMove();
|
processMove();
|
||||||
|
} else {
|
||||||
|
UINFO(2," Set up mtasks...\n");
|
||||||
|
processMTasks();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Any SC inputs feeding a combo domain must be marked, so we can make them sc_sensitive
|
// Any SC inputs feeding a combo domain must be marked, so we can make them sc_sensitive
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,7 @@
|
||||||
//
|
//
|
||||||
// V3GraphVertex
|
// V3GraphVertex
|
||||||
// OrderMoveVertex
|
// OrderMoveVertex
|
||||||
|
// MTaskMoveVertex
|
||||||
// OrderEitherVertex
|
// OrderEitherVertex
|
||||||
// OrderInputsVertex
|
// OrderInputsVertex
|
||||||
// OrderSettleVertex
|
// OrderSettleVertex
|
||||||
|
|
@ -47,6 +48,7 @@
|
||||||
#include "verilatedos.h"
|
#include "verilatedos.h"
|
||||||
#include "V3Ast.h"
|
#include "V3Ast.h"
|
||||||
#include "V3Graph.h"
|
#include "V3Graph.h"
|
||||||
|
#include VL_INCLUDE_UNORDERED_MAP
|
||||||
|
|
||||||
class OrderVisitor;
|
class OrderVisitor;
|
||||||
class OrderMoveVertex;
|
class OrderMoveVertex;
|
||||||
|
|
@ -363,6 +365,57 @@ public:
|
||||||
void domScopep(OrderMoveDomScope* ds) { m_domScopep=ds; }
|
void domScopep(OrderMoveDomScope* ds) { m_domScopep=ds; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Similar to OrderMoveVertex, but modified for threaded code generation.
|
||||||
|
class MTaskMoveVertex : public V3GraphVertex {
|
||||||
|
// This could be more compact, since we know m_varp and m_logicp
|
||||||
|
// cannot both be set. Each MTaskMoveVertex represents a logic node
|
||||||
|
// or a var node, it can't be both.
|
||||||
|
OrderLogicVertex* m_logicp; // Logic represented by this vertex
|
||||||
|
const OrderEitherVertex* m_varp; // Var represented by this vertex
|
||||||
|
const AstScope* m_scopep;
|
||||||
|
const AstSenTree* m_domainp;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
friend class OrderVisitor;
|
||||||
|
friend class MTaskMoveVertexMaker;
|
||||||
|
public:
|
||||||
|
MTaskMoveVertex(V3Graph* graphp, OrderLogicVertex* logicp,
|
||||||
|
const OrderEitherVertex* varp,
|
||||||
|
const AstScope* scopep, const AstSenTree* domainp)
|
||||||
|
: V3GraphVertex(graphp), m_logicp(logicp),
|
||||||
|
m_varp(varp), m_scopep(scopep), m_domainp(domainp) {
|
||||||
|
UASSERT(!(logicp && varp),
|
||||||
|
"MTaskMoveVertex: logicp and varp may not both be set!\n");
|
||||||
|
}
|
||||||
|
virtual ~MTaskMoveVertex() {}
|
||||||
|
virtual MTaskMoveVertex* clone(V3Graph* graphp) const {
|
||||||
|
v3fatalSrc("Unsupported"); return NULL; }
|
||||||
|
virtual OrderVEdgeType type() const { return OrderVEdgeType::VERTEX_MOVE; }
|
||||||
|
virtual string dotColor() const {
|
||||||
|
if (logicp()) return logicp()->dotColor();
|
||||||
|
else return "yellow";
|
||||||
|
}
|
||||||
|
virtual string name() const {
|
||||||
|
string nm;
|
||||||
|
if (logicp()) {
|
||||||
|
nm = logicp()->name();
|
||||||
|
nm += (string("\\nMV:")
|
||||||
|
+" d="+cvtToStr((void*)logicp()->domainp())
|
||||||
|
+" s="+cvtToStr((void*)logicp()->scopep())
|
||||||
|
// "color()" represents the mtask ID.
|
||||||
|
+"\\nt="+cvtToStr(color()));
|
||||||
|
} else {
|
||||||
|
nm = "nolog\\nt="+cvtToStr(color());
|
||||||
|
}
|
||||||
|
return nm;
|
||||||
|
}
|
||||||
|
// ACCESSORS
|
||||||
|
OrderLogicVertex* logicp() const { return m_logicp; }
|
||||||
|
const OrderEitherVertex* varp() const { return m_varp; }
|
||||||
|
const AstScope* scopep() const { return m_scopep; }
|
||||||
|
const AstSenTree* domainp() const { return m_domainp; }
|
||||||
|
};
|
||||||
|
|
||||||
//######################################################################
|
//######################################################################
|
||||||
// Edge types
|
// Edge types
|
||||||
|
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,99 @@
|
||||||
|
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
||||||
|
//*************************************************************************
|
||||||
|
// DESCRIPTION: Verilator: Threading's logic to mtask partitioner
|
||||||
|
//
|
||||||
|
// Code available from: http://www.veripool.org/verilator
|
||||||
|
//
|
||||||
|
//*************************************************************************
|
||||||
|
//
|
||||||
|
// Copyright 2003-2018 by Wilson Snyder. This program is free software; you can
|
||||||
|
// redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
// Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
|
// Version 2.0.
|
||||||
|
//
|
||||||
|
// Verilator is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
//*************************************************************************
|
||||||
|
|
||||||
|
#ifndef _V3PARTITION_H_
|
||||||
|
#define _V3PARTITION_H_
|
||||||
|
|
||||||
|
#include "config_build.h"
|
||||||
|
#include "verilatedos.h"
|
||||||
|
#include <list>
|
||||||
|
|
||||||
|
#include "V3Graph.h"
|
||||||
|
#include "V3OrderGraph.h"
|
||||||
|
|
||||||
|
class LogicMTask;
|
||||||
|
typedef vl_unordered_map<const MTaskMoveVertex*, LogicMTask*> Vx2MTaskMap;
|
||||||
|
|
||||||
|
//*************************************************************************
|
||||||
|
/// V3Partition takes the fine-grained logic graph from V3Order and
|
||||||
|
/// collapses it into a coarse-grained graph of AbstractLogicMTask's, each
|
||||||
|
/// of which contains a set of the logic nodes from the fine-grained
|
||||||
|
/// graph.
|
||||||
|
|
||||||
|
class V3Partition {
|
||||||
|
// MEMBERS
|
||||||
|
V3Graph* m_fineDepsGraphp; // Fine-grained dependency graph
|
||||||
|
public:
|
||||||
|
// CONSTRUCTORS
|
||||||
|
explicit V3Partition(V3Graph* fineDepsGraphp)
|
||||||
|
: m_fineDepsGraphp(fineDepsGraphp) {}
|
||||||
|
~V3Partition() {}
|
||||||
|
|
||||||
|
// METHODS
|
||||||
|
|
||||||
|
// Fill in the provided empty graph with AbstractLogicMTask's and their
|
||||||
|
// interdependencies.
|
||||||
|
void go(V3Graph* mtasksp);
|
||||||
|
|
||||||
|
static void selfTest();
|
||||||
|
|
||||||
|
// Print out a hash of the shape of graphp. Only needed to debug the
|
||||||
|
// origin of some nondeterminism; otherwise this is pretty useless.
|
||||||
|
static void hashGraphDebug(const V3Graph* graphp, const char* debugName);
|
||||||
|
|
||||||
|
// Print debug stats about graphp whose nodes must be AbstractMTask's.
|
||||||
|
static void debugMTaskGraphStats(const V3Graph* graphp, const string& name);
|
||||||
|
|
||||||
|
// Operate on the final ExecMTask graph, immediately prior to code
|
||||||
|
// generation time.
|
||||||
|
static void finalize();
|
||||||
|
private:
|
||||||
|
static void finalizeCosts(V3Graph* execMTaskGraphp);
|
||||||
|
static void setupMTaskDeps(V3Graph* mtasksp, const Vx2MTaskMap* vx2mtaskp);
|
||||||
|
|
||||||
|
VL_DEBUG_FUNC; // Declare debug()
|
||||||
|
VL_UNCOPYABLE(V3Partition);
|
||||||
|
};
|
||||||
|
|
||||||
|
//*************************************************************************
|
||||||
|
// Map a pointer into an id, e.g. for nodep to mtask mappings
|
||||||
|
|
||||||
|
class PartPtrIdMap {
|
||||||
|
private:
|
||||||
|
// TYPES
|
||||||
|
typedef vl_unordered_map <const void*, vluint64_t> PtrMap;
|
||||||
|
// MEMBERS
|
||||||
|
mutable vluint64_t m_nextId;
|
||||||
|
mutable PtrMap m_id;
|
||||||
|
public:
|
||||||
|
// CONSTRUCTORS
|
||||||
|
PartPtrIdMap() : m_nextId(0) {}
|
||||||
|
// METHODS
|
||||||
|
vluint64_t findId(const void* ptrp) const {
|
||||||
|
PtrMap::iterator it = m_id.find(ptrp);
|
||||||
|
if (it != m_id.end()) {
|
||||||
|
return it->second;
|
||||||
|
}
|
||||||
|
m_id[ptrp] = m_nextId;
|
||||||
|
return m_nextId++;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // Guard
|
||||||
|
|
@ -0,0 +1,108 @@
|
||||||
|
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
||||||
|
//*************************************************************************
|
||||||
|
// DESCRIPTION: Verilator: Threading's graph structures
|
||||||
|
//
|
||||||
|
// Code available from: http://www.veripool.org/verilator
|
||||||
|
//
|
||||||
|
//*************************************************************************
|
||||||
|
//
|
||||||
|
// Copyright 2003-2018 by Wilson Snyder. This program is free software; you can
|
||||||
|
// redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
// Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
|
// Version 2.0.
|
||||||
|
//
|
||||||
|
// Verilator is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
//*************************************************************************
|
||||||
|
|
||||||
|
#ifndef _V3PARTITIONGRAPH_H_
|
||||||
|
#define _V3PARTITIONGRAPH_H_
|
||||||
|
|
||||||
|
#include "config_build.h"
|
||||||
|
#include "verilatedos.h"
|
||||||
|
#include <list>
|
||||||
|
|
||||||
|
#include "V3Graph.h"
|
||||||
|
#include "V3OrderGraph.h"
|
||||||
|
|
||||||
|
//*************************************************************************
|
||||||
|
// MTasks and graph structures
|
||||||
|
|
||||||
|
class AbstractMTask : public V3GraphVertex {
|
||||||
|
public:
|
||||||
|
AbstractMTask(V3Graph* graphp) : V3GraphVertex(graphp) {}
|
||||||
|
virtual ~AbstractMTask() {}
|
||||||
|
virtual uint32_t id() const = 0;
|
||||||
|
virtual uint32_t cost() const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
class AbstractLogicMTask : public AbstractMTask {
|
||||||
|
public:
|
||||||
|
// TYPES
|
||||||
|
typedef std::list<MTaskMoveVertex*> VxList;
|
||||||
|
// CONSTRUCTORS
|
||||||
|
AbstractLogicMTask(V3Graph* graphp) : AbstractMTask(graphp) {}
|
||||||
|
virtual ~AbstractLogicMTask() {}
|
||||||
|
// METHODS
|
||||||
|
// Set of logic vertices in this mtask. Order is not significant.
|
||||||
|
virtual const VxList* vertexListp() const = 0;
|
||||||
|
virtual uint32_t id() const = 0; // Unique id of this mtask.
|
||||||
|
virtual uint32_t cost() const = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
class ExecMTask : public AbstractMTask {
|
||||||
|
private:
|
||||||
|
AstMTaskBody* m_bodyp; // Task body
|
||||||
|
uint32_t m_id; // Unique id of this mtask.
|
||||||
|
uint32_t m_priority; // Predicted critical path from the start of
|
||||||
|
// this mtask to the ends of the graph that are reachable from this
|
||||||
|
// mtask. In abstract time units.
|
||||||
|
uint32_t m_cost; // Predicted runtime of this mtask, in the same
|
||||||
|
// abstract time units as priority().
|
||||||
|
uint32_t m_thread; // Thread for static (pack_mtasks) scheduling,
|
||||||
|
// or 0xffffffff if not yet assigned.
|
||||||
|
const ExecMTask* m_packNextp; // Next for static (pack_mtasks) scheduling
|
||||||
|
bool m_threadRoot; // Is root thread
|
||||||
|
VL_UNCOPYABLE(ExecMTask);
|
||||||
|
public:
|
||||||
|
ExecMTask(V3Graph* graphp, AstMTaskBody* bodyp, uint32_t id)
|
||||||
|
: AbstractMTask(graphp),
|
||||||
|
m_bodyp(bodyp),
|
||||||
|
m_id(id),
|
||||||
|
m_priority(0),
|
||||||
|
m_cost(0),
|
||||||
|
m_thread(0xffffffff),
|
||||||
|
m_packNextp(NULL),
|
||||||
|
m_threadRoot(false) {}
|
||||||
|
AstMTaskBody* bodyp() const { return m_bodyp; }
|
||||||
|
virtual uint32_t id() const { return m_id; }
|
||||||
|
uint32_t priority() const { return m_priority; }
|
||||||
|
void priority(uint32_t pri) { m_priority = pri; }
|
||||||
|
virtual uint32_t cost() const { return m_cost; }
|
||||||
|
void cost(uint32_t cost) { m_cost = cost; }
|
||||||
|
void thread(uint32_t thread) { m_thread = thread; }
|
||||||
|
uint32_t thread() const { return m_thread; }
|
||||||
|
void packNextp(const ExecMTask* nextp) { m_packNextp = nextp; }
|
||||||
|
const ExecMTask* packNextp() const { return m_packNextp; }
|
||||||
|
bool threadRoot() const { return m_threadRoot; }
|
||||||
|
void threadRoot(bool threadRoot) { m_threadRoot = threadRoot; }
|
||||||
|
string cFuncName() const {
|
||||||
|
// If this MTask maps to a C function, this should be the name
|
||||||
|
return string("__Vmtask")+"__"+cvtToStr(m_id);
|
||||||
|
}
|
||||||
|
string name() const { return string("mt")+cvtToStr(id()); }
|
||||||
|
void dump(std::ostream& str) const {
|
||||||
|
str <<name()<<"."<<((void*)this);
|
||||||
|
if (priority() || cost()) str <<" [pr="<<priority()<<" c="<<cvtToStr(cost())<<"]";
|
||||||
|
if (thread() != 0xffffffff) str <<" th="<<thread();
|
||||||
|
if (threadRoot()) str <<" [ROOT]";
|
||||||
|
if (packNextp()) str <<" nx="<<packNextp()->name();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// Stream an ExecMTask using its dump() method
inline std::ostream& operator<<(std::ostream& os, const ExecMTask& rhs) {
    rhs.dump(os);
    return os;
}
|
||||||
|
|
||||||
|
#endif // Guard
|
||||||
|
|
@ -182,6 +182,7 @@ private:
|
||||||
AstNode* m_chgSubParentp;// Which node has call to m_chgSubFuncp
|
AstNode* m_chgSubParentp;// Which node has call to m_chgSubFuncp
|
||||||
int m_chgSubStmts; // Statements under function being built
|
int m_chgSubStmts; // Statements under function being built
|
||||||
AstVarScope* m_activityVscp; // Activity variable
|
AstVarScope* m_activityVscp; // Activity variable
|
||||||
|
uint32_t m_activityNumber; // Count of fields in activity variable
|
||||||
uint32_t m_code; // Trace ident code# being assigned
|
uint32_t m_code; // Trace ident code# being assigned
|
||||||
V3Graph m_graph; // Var/CFunc tracking
|
V3Graph m_graph; // Var/CFunc tracking
|
||||||
TraceActivityVertex* m_alwaysVtxp; // "Always trace" vertex
|
TraceActivityVertex* m_alwaysVtxp; // "Always trace" vertex
|
||||||
|
|
@ -297,7 +298,7 @@ private:
|
||||||
|
|
||||||
void assignActivity() {
|
void assignActivity() {
|
||||||
// Select activity numbers and put into each CFunc vertex
|
// Select activity numbers and put into each CFunc vertex
|
||||||
uint32_t activityNumber = 1; // Note 0 indicates "slow"
|
m_activityNumber = 1; // Note 0 indicates "slow"
|
||||||
for (V3GraphVertex* itp = m_graph.verticesBeginp(); itp; itp=itp->verticesNextp()) {
|
for (V3GraphVertex* itp = m_graph.verticesBeginp(); itp; itp=itp->verticesNextp()) {
|
||||||
if (TraceActivityVertex* vvertexp = dynamic_cast<TraceActivityVertex*>(itp)) {
|
if (TraceActivityVertex* vvertexp = dynamic_cast<TraceActivityVertex*>(itp)) {
|
||||||
if (!vvertexp->activityCodeValid()) {
|
if (!vvertexp->activityCodeValid()) {
|
||||||
|
|
@ -306,17 +307,39 @@ private:
|
||||||
// This makes us need less activityNumbers and so speeds up the fast path.
|
// This makes us need less activityNumbers and so speeds up the fast path.
|
||||||
vvertexp->activityCode(TraceActivityVertex::ACTIVITY_SLOW);
|
vvertexp->activityCode(TraceActivityVertex::ACTIVITY_SLOW);
|
||||||
} else {
|
} else {
|
||||||
vvertexp->activityCode(activityNumber++);
|
vvertexp->activityCode(m_activityNumber++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Insert global variable
|
AstVar* newvarp;
|
||||||
if (!activityNumber) activityNumber++; // For simplicity, always create it
|
if (v3Global.opt.mtasks()) {
|
||||||
int activityBits = VL_WORDS_I(activityNumber)*VL_WORDSIZE; // For tighter code; round to next 32 bit point.
|
// Create a vector of bytes, not bits, for the tracing vector,
|
||||||
AstVar* newvarp = new AstVar (m_chgFuncp->fileline(), AstVarType::MODULETEMP,
|
// so that we can set them atomically without locking.
|
||||||
"__Vm_traceActivity", VFlagBitPacked(), activityBits);
|
//
|
||||||
|
// TODO: It would be slightly faster to have a bit vector per
|
||||||
|
// chain of packed MTasks, but we haven't packed the MTasks yet.
|
||||||
|
// If we support fully threaded tracing in the future, it would
|
||||||
|
// make sense to improve this at that time.
|
||||||
|
AstNodeDType* newScalarDtp
|
||||||
|
= new AstBasicDType(m_chgFuncp->fileline(), VFlagLogicPacked(), 1);
|
||||||
|
v3Global.rootp()->typeTablep()->addTypesp(newScalarDtp);
|
||||||
|
AstNodeDType* newArrDtp = new AstUnpackArrayDType(
|
||||||
|
m_chgFuncp->fileline(),
|
||||||
|
newScalarDtp,
|
||||||
|
new AstRange(m_chgFuncp->fileline(),
|
||||||
|
VNumRange(m_activityNumber-1, 0, false)));
|
||||||
|
v3Global.rootp()->typeTablep()->addTypesp(newArrDtp);
|
||||||
|
newvarp = new AstVar(m_chgFuncp->fileline(),
|
||||||
|
AstVarType::MODULETEMP,
|
||||||
|
"__Vm_traceActivity", newArrDtp);
|
||||||
|
} else {
|
||||||
|
// For tighter code; round to next 32 bit point.
|
||||||
|
int activityBits = VL_WORDS_I(m_activityNumber)*VL_WORDSIZE;
|
||||||
|
newvarp = new AstVar(m_chgFuncp->fileline(), AstVarType::MODULETEMP,
|
||||||
|
"__Vm_traceActivity", VFlagBitPacked(), activityBits);
|
||||||
|
}
|
||||||
m_topModp->addStmtp(newvarp);
|
m_topModp->addStmtp(newvarp);
|
||||||
AstVarScope* newvscp = new AstVarScope(newvarp->fileline(), m_highScopep, newvarp);
|
AstVarScope* newvscp = new AstVarScope(newvarp->fileline(), m_highScopep, newvarp);
|
||||||
m_highScopep->addVarp(newvscp);
|
m_highScopep->addVarp(newvscp);
|
||||||
|
|
@ -329,15 +352,23 @@ private:
|
||||||
FileLine* fl = vvertexp->insertp()->fileline();
|
FileLine* fl = vvertexp->insertp()->fileline();
|
||||||
uint32_t acode = vvertexp->activityCode();
|
uint32_t acode = vvertexp->activityCode();
|
||||||
vvertexp->insertp()->addNextHere
|
vvertexp->insertp()->addNextHere
|
||||||
(new AstAssign (fl,
|
(new AstAssign(fl, selectActivity(fl, acode, true),
|
||||||
new AstSel (fl, new AstVarRef(fl, m_activityVscp, true),
|
new AstConst(fl, AstConst::LogicTrue())));
|
||||||
acode, 1),
|
|
||||||
new AstConst (fl, AstConst::LogicTrue())));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Build an expression selecting entry 'acode' of the activity variable
// (m_activityVscp).  With mtasks enabled this is an array element select;
// otherwise it is a 1-bit select at bit 'acode'.  'lvalue' is passed on to
// the variable reference.
AstNode* selectActivity(FileLine* flp, uint32_t acode, bool lvalue) {
    AstVarRef* const refp = new AstVarRef(flp, m_activityVscp, lvalue);
    if (!v3Global.opt.mtasks()) {
        return new AstSel(flp, refp, acode, 1);
    }
    return new AstArraySel(flp, refp, acode);
}
|
||||||
|
|
||||||
AstCFunc* newCFunc(AstCFuncType type, const string& name, AstCFunc* basep) {
|
AstCFunc* newCFunc(AstCFuncType type, const string& name, AstCFunc* basep) {
|
||||||
AstCFunc* funcp = new AstCFunc(basep->fileline(), name, basep->scopep());
|
AstCFunc* funcp = new AstCFunc(basep->fileline(), name, basep->scopep());
|
||||||
funcp->slow(basep->slow());
|
funcp->slow(basep->slow());
|
||||||
|
|
@ -453,8 +484,7 @@ private:
|
||||||
AstNode* condp = NULL;
|
AstNode* condp = NULL;
|
||||||
for (ActCodeSet::const_iterator csit = actset.begin(); csit!=actset.end(); ++csit) {
|
for (ActCodeSet::const_iterator csit = actset.begin(); csit!=actset.end(); ++csit) {
|
||||||
uint32_t acode = *csit;
|
uint32_t acode = *csit;
|
||||||
AstNode* selp = new AstSel (fl, new AstVarRef(fl, m_activityVscp, false),
|
AstNode* selp = selectActivity(fl, acode, false);
|
||||||
acode, 1);
|
|
||||||
if (condp) condp = new AstOr (fl, condp, selp);
|
if (condp) condp = new AstOr (fl, condp, selp);
|
||||||
else condp = selp;
|
else condp = selp;
|
||||||
}
|
}
|
||||||
|
|
@ -473,11 +503,19 @@ private:
|
||||||
|
|
||||||
// Clear activity after tracing completes
|
// Clear activity after tracing completes
|
||||||
FileLine* fl = m_chgFuncp->fileline();
|
FileLine* fl = m_chgFuncp->fileline();
|
||||||
AstNode* clrp = new AstAssign (fl,
|
if (v3Global.opt.mtasks()) {
|
||||||
new AstVarRef(fl, m_activityVscp, true),
|
for (uint32_t i = 0; i < m_activityNumber; ++i) {
|
||||||
new AstConst(fl, V3Number(fl, m_activityVscp->width())));
|
AstNode* clrp = new AstAssign(fl, selectActivity(fl, i, true),
|
||||||
m_fullFuncp->addFinalsp(clrp->cloneTree(true));
|
new AstConst(fl, AstConst::LogicFalse()));
|
||||||
m_chgFuncp->addFinalsp(clrp);
|
m_fullFuncp->addFinalsp(clrp->cloneTree(true));
|
||||||
|
m_chgFuncp->addFinalsp(clrp);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
AstNode* clrp = new AstAssign(fl, new AstVarRef(fl, m_activityVscp, true),
|
||||||
|
new AstConst(fl, V3Number(fl, m_activityVscp->width())));
|
||||||
|
m_fullFuncp->addFinalsp(clrp->cloneTree(true));
|
||||||
|
m_chgFuncp->addFinalsp(clrp);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t assignDeclCode(AstTraceDecl* nodep) {
|
uint32_t assignDeclCode(AstTraceDecl* nodep) {
|
||||||
|
|
@ -699,6 +737,7 @@ public:
|
||||||
m_chgSubFuncp = NULL;
|
m_chgSubFuncp = NULL;
|
||||||
m_chgSubParentp = NULL;
|
m_chgSubParentp = NULL;
|
||||||
m_chgSubStmts = 0;
|
m_chgSubStmts = 0;
|
||||||
|
m_activityNumber = 0;
|
||||||
m_code = 0;
|
m_code = 0;
|
||||||
m_finding = false;
|
m_finding = false;
|
||||||
m_funcNum = 0;
|
m_funcNum = 0;
|
||||||
|
|
|
||||||
|
|
@ -73,6 +73,7 @@
|
||||||
#include "V3Param.h"
|
#include "V3Param.h"
|
||||||
#include "V3Parse.h"
|
#include "V3Parse.h"
|
||||||
#include "V3ParseSym.h"
|
#include "V3ParseSym.h"
|
||||||
|
#include "V3Partition.h"
|
||||||
#include "V3PreShell.h"
|
#include "V3PreShell.h"
|
||||||
#include "V3Premit.h"
|
#include "V3Premit.h"
|
||||||
#include "V3Reloop.h"
|
#include "V3Reloop.h"
|
||||||
|
|
@ -524,6 +525,14 @@ void process () {
|
||||||
V3EmitC::emitcSyms();
|
V3EmitC::emitcSyms();
|
||||||
V3EmitC::emitcTrace();
|
V3EmitC::emitcTrace();
|
||||||
}
|
}
|
||||||
|
if (!v3Global.opt.xmlOnly()
|
||||||
|
&& v3Global.opt.mtasks()) {
|
||||||
|
// Finalize our MTask cost estimates and pack the mtasks into
|
||||||
|
// threads. Must happen pre-EmitC which relies on the packing
|
||||||
|
// order. Must happen post-V3LifePost which changes the relative
|
||||||
|
// costs of mtasks.
|
||||||
|
V3Partition::finalize();
|
||||||
|
}
|
||||||
if (!v3Global.opt.xmlOnly()) { // Unfortunately we have some lint checks in emitc.
|
if (!v3Global.opt.xmlOnly()) { // Unfortunately we have some lint checks in emitc.
|
||||||
V3EmitC::emitc();
|
V3EmitC::emitc();
|
||||||
}
|
}
|
||||||
|
|
@ -607,8 +616,11 @@ int main(int argc, char** argv, char** env) {
|
||||||
VHashSha1::selfTest();
|
VHashSha1::selfTest();
|
||||||
AstBasicDTypeKwd::selfTest();
|
AstBasicDTypeKwd::selfTest();
|
||||||
V3Graph::selfTest();
|
V3Graph::selfTest();
|
||||||
V3TSP::selfTest();
|
if (v3Global.opt.debugSelfTest()) {
|
||||||
V3ScoreboardBase::selfTest();
|
V3TSP::selfTest();
|
||||||
|
V3ScoreboardBase::selfTest();
|
||||||
|
V3Partition::selfTest();
|
||||||
|
}
|
||||||
|
|
||||||
// Read first filename
|
// Read first filename
|
||||||
v3Global.readFiles();
|
v3Global.readFiles();
|
||||||
|
|
|
||||||
|
|
@ -44,7 +44,7 @@ endif
|
||||||
|
|
||||||
.PHONY: test
|
.PHONY: test
|
||||||
test:
|
test:
|
||||||
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --dist
|
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --vltmt --dist
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
|
|
||||||
|
|
@ -61,6 +61,9 @@ nc:
|
||||||
vlt:
|
vlt:
|
||||||
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --stop
|
$(PERL) driver.pl $(DRIVER_FLAGS) --vlt --stop
|
||||||
|
|
||||||
|
vltmt:
|
||||||
|
$(PERL) driver.pl $(DRIVER_FLAGS) --vltmt --stop
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
|
|
||||||
random:
|
random:
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,7 @@ our %All_Scenarios
|
||||||
nc => ["simulator", "nc"],
|
nc => ["simulator", "nc"],
|
||||||
vcs => ["simulator", "vcs"],
|
vcs => ["simulator", "vcs"],
|
||||||
vlt => ["simulator", "vlt_all", "vlt"],
|
vlt => ["simulator", "vlt_all", "vlt"],
|
||||||
|
vltmt => ["simulator", "vlt_all", "vltmt"],
|
||||||
);
|
);
|
||||||
|
|
||||||
#======================================================================
|
#======================================================================
|
||||||
|
|
@ -104,6 +105,7 @@ if (! GetOptions (
|
||||||
"ms!" => sub { $opt_scenarios{ms} = $_[1]; },
|
"ms!" => sub { $opt_scenarios{ms} = $_[1]; },
|
||||||
"nc!" => sub { $opt_scenarios{nc} = $_[1]; },
|
"nc!" => sub { $opt_scenarios{nc} = $_[1]; },
|
||||||
"vlt!" => sub { $opt_scenarios{vlt} = $_[1]; },
|
"vlt!" => sub { $opt_scenarios{vlt} = $_[1]; },
|
||||||
|
"vltmt!" => sub { $opt_scenarios{vltmt} = $_[1]; },
|
||||||
"vcs!" => sub { $opt_scenarios{vcs} = $_[1]; },
|
"vcs!" => sub { $opt_scenarios{vcs} = $_[1]; },
|
||||||
"<>" => \¶meter,
|
"<>" => \¶meter,
|
||||||
)) {
|
)) {
|
||||||
|
|
@ -322,6 +324,7 @@ sub new {
|
||||||
$self->{scenario} ||= "ghdl" if $self->{ghdl};
|
$self->{scenario} ||= "ghdl" if $self->{ghdl};
|
||||||
$self->{scenario} ||= "vcs" if $self->{vcs};
|
$self->{scenario} ||= "vcs" if $self->{vcs};
|
||||||
$self->{scenario} ||= "vlt" if $self->{vlt};
|
$self->{scenario} ||= "vlt" if $self->{vlt};
|
||||||
|
$self->{scenario} ||= "vltmt" if $self->{vltmt};
|
||||||
$self->{scenario} ||= "nc" if $self->{nc};
|
$self->{scenario} ||= "nc" if $self->{nc};
|
||||||
$self->{scenario} ||= "ms" if $self->{ms};
|
$self->{scenario} ||= "ms" if $self->{ms};
|
||||||
$self->{scenario} ||= "iv" if $self->{iv};
|
$self->{scenario} ||= "iv" if $self->{iv};
|
||||||
|
|
@ -407,6 +410,7 @@ sub new {
|
||||||
ms_run_flags => [split(/\s+/,"-lib $self->{obj_dir}/work -c -do 'run -all;quit' ")],
|
ms_run_flags => [split(/\s+/,"-lib $self->{obj_dir}/work -c -do 'run -all;quit' ")],
|
||||||
# Verilator
|
# Verilator
|
||||||
vlt => 0,
|
vlt => 0,
|
||||||
|
vltmt => 0,
|
||||||
verilator_flags => ["-cc",
|
verilator_flags => ["-cc",
|
||||||
"-Mdir $self->{obj_dir}",
|
"-Mdir $self->{obj_dir}",
|
||||||
"-OD", # As currently disabled unless -O3
|
"-OD", # As currently disabled unless -O3
|
||||||
|
|
@ -420,7 +424,7 @@ sub new {
|
||||||
%$self};
|
%$self};
|
||||||
bless $self, $class;
|
bless $self, $class;
|
||||||
|
|
||||||
$self->{vlt_all} = $self->{vlt}; # Any Verilator scenario
|
$self->{vlt_all} = $self->{vlt} || $self->{vltmt}; # Any Verilator scenario
|
||||||
|
|
||||||
$self->{VM_PREFIX} ||= "V".$self->{name};
|
$self->{VM_PREFIX} ||= "V".$self->{name};
|
||||||
$self->{stats} ||= "$self->{obj_dir}/V".$self->{name}."__stats.txt";
|
$self->{stats} ||= "$self->{obj_dir}/V".$self->{name}."__stats.txt";
|
||||||
|
|
@ -593,6 +597,8 @@ sub compile_vlt_flags {
|
||||||
unshift @verilator_flags, "--gdbbt" if $opt_gdbbt;
|
unshift @verilator_flags, "--gdbbt" if $opt_gdbbt;
|
||||||
unshift @verilator_flags, "--x-assign unique"; # More likely to be buggy
|
unshift @verilator_flags, "--x-assign unique"; # More likely to be buggy
|
||||||
unshift @verilator_flags, "--trace" if $opt_trace;
|
unshift @verilator_flags, "--trace" if $opt_trace;
|
||||||
|
unshift @verilator_flags, "--threads 3" if $param{vltmt};
|
||||||
|
unshift @verilator_flags, "--debug-partition" if $param{vltmt};
|
||||||
if (defined $opt_optimize) {
|
if (defined $opt_optimize) {
|
||||||
my $letters = "";
|
my $letters = "";
|
||||||
if ($opt_optimize =~ /[a-zA-Z]/) {
|
if ($opt_optimize =~ /[a-zA-Z]/) {
|
||||||
|
|
@ -746,6 +752,11 @@ sub compile {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ($self->{vltmt} && !$self->cfg_with_threaded) {
|
||||||
|
$self->skip("Test requires Verilator configured with threads\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
if (!$param{fails} && $param{verilator_make_gcc}
|
if (!$param{fails} && $param{verilator_make_gcc}
|
||||||
&& $param{make_main}) {
|
&& $param{make_main}) {
|
||||||
$self->_make_main();
|
$self->_make_main();
|
||||||
|
|
@ -2045,7 +2056,11 @@ Run Synopsys VCS simulator tests.
|
||||||
|
|
||||||
=item --vlt
|
=item --vlt
|
||||||
|
|
||||||
Run Verilator tests. Default unless another scenario flag is provided.
|
Run Verilator tests in single-threaded mode. Default unless another scenario flag is provided.
|
||||||
|
|
||||||
|
=item --vltmt
|
||||||
|
|
||||||
|
Run Verilator tests in multithreaded mode.
|
||||||
|
|
||||||
=back
|
=back
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,22 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
|
||||||
|
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||||
|
#
|
||||||
|
# Copyright 2003 by Wilson Snyder. This program is free software; you can
|
||||||
|
# redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
|
# Version 2.0.
|
||||||
|
|
||||||
|
scenarios(vlt_all => 1);
|
||||||
|
|
||||||
|
top_filename("t/t_EXAMPLE.v");
|
||||||
|
|
||||||
|
compile(
|
||||||
|
verilator_flags2 => ['--debug-self-test'],
|
||||||
|
verilator_make_gcc => 0,
|
||||||
|
make_top_shell => 0,
|
||||||
|
make_main => 0,
|
||||||
|
);
|
||||||
|
|
||||||
|
ok(1);
|
||||||
|
1;
|
||||||
|
|
@ -15,7 +15,8 @@ compile(
|
||||||
|
|
||||||
if ($Self->{vlt_all}) {
|
if ($Self->{vlt_all}) {
|
||||||
file_grep ($Self->{stats}, qr/Optimizations, Tables created\s+(\d+)/i, 10);
|
file_grep ($Self->{stats}, qr/Optimizations, Tables created\s+(\d+)/i, 10);
|
||||||
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 8);
|
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
|
||||||
|
($Self->{vltmt} ? 0 : 8));
|
||||||
}
|
}
|
||||||
|
|
||||||
execute(
|
execute(
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,21 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
|
||||||
|
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||||
|
#
|
||||||
|
# Copyright 2018 by Wilson Snyder. This program is free software; you can
|
||||||
|
# redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
|
# Version 2.0.
|
||||||
|
|
||||||
|
scenarios(vltmt => 1);
|
||||||
|
|
||||||
|
compile(
|
||||||
|
v_flags2 => ["t/t_dpi_threads_c.cpp --no-threads-coarsen"],
|
||||||
|
);
|
||||||
|
|
||||||
|
execute(
|
||||||
|
check_finished => 1,
|
||||||
|
);
|
||||||
|
|
||||||
|
ok(1);
|
||||||
|
1;
|
||||||
|
|
@ -0,0 +1,62 @@
|
||||||
|
// DESCRIPTION: Verilator: Verilog Test module
|
||||||
|
//
|
||||||
|
// Copyright 2018 by Wilson Snyder. This program is free software; you can
|
||||||
|
// redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
// Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
|
// Version 2.0.
|
||||||
|
|
||||||
|
import "DPI-C" dpii_sys_task = function void \$dpii_sys ();
|
||||||
|
import "DPI-C" dpii_failure = function int \$dpii_failure ();
|
||||||
|
|
||||||
|
module t (clk);
|
||||||
|
input clk;
|
||||||
|
integer cyc;
|
||||||
|
integer failure;
|
||||||
|
|
||||||
|
initial cyc = 0;
|
||||||
|
|
||||||
|
`ifndef verilator
|
||||||
|
`error "Only Verilator supports PLI-ish DPI calls."
|
||||||
|
`endif
|
||||||
|
|
||||||
|
always @ (posedge clk) begin
|
||||||
|
if (cyc == 2) begin
|
||||||
|
failure = $dpii_failure();
|
||||||
|
$write("* failure = %0d\n", failure);
|
||||||
|
if (failure > 0) begin
|
||||||
|
$stop;
|
||||||
|
end
|
||||||
|
$write("*-* All Finished *-*\n");
|
||||||
|
$finish;
|
||||||
|
end
|
||||||
|
cyc <= cyc + 1;
|
||||||
|
end
|
||||||
|
|
||||||
|
// The purpose of this test is to confirm that the DPI-call serialization
|
||||||
|
// code in V3Partition does ensure that these DPI calls do not run
|
||||||
|
// concurrently.
|
||||||
|
//
|
||||||
|
// Alternatively, the test may be run with "--threads-dpi all" in which case
|
||||||
|
// it should confirm that the calls do run concurrently and do detect a
|
||||||
|
// collision (they should, if the test is set up right.) This is
|
||||||
|
// t_dpi_threads_collide.pl.
|
||||||
|
//
|
||||||
|
// Q) Is it a risk that the partitioner will merge or serialize these always
|
||||||
|
// blocks, just by luck, even if the DPI-call serialization code fails?
|
||||||
|
//
|
||||||
|
// A) Yes, that's why t_dpi_threads_collide.pl also passes
|
||||||
|
// --no-threads-coarsen to disable MTask coarsening. This ensures that
|
||||||
|
// the MTask graph at the end of FixDataHazards (where we resolve DPI
|
||||||
|
// hazards) is basically the final MTasks graph, and that data hazards
|
||||||
|
// which persist beyond FixDataHazards should persist in the final
|
||||||
|
// generated C code.
|
||||||
|
|
||||||
|
always @ (posedge clk) begin
|
||||||
|
$dpii_sys();
|
||||||
|
end
|
||||||
|
|
||||||
|
always @ (posedge clk) begin
|
||||||
|
$dpii_sys();
|
||||||
|
end
|
||||||
|
|
||||||
|
endmodule
|
||||||
|
|
@ -0,0 +1,78 @@
|
||||||
|
// -*- mode: C++; c-file-style: "cc-mode" -*-
|
||||||
|
//*************************************************************************
|
||||||
|
//
|
||||||
|
// Copyright 2018-2018 by Wilson Snyder. This program is free software; you can
|
||||||
|
// redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
// Lesser General Public License Version 3 or the Perl Artistic License.
|
||||||
|
// Version 2.0.
|
||||||
|
//
|
||||||
|
// Verilator is distributed in the hope that it will be useful,
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU General Public License for more details.
|
||||||
|
//
|
||||||
|
//*************************************************************************
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <iostream>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include "svdpi.h"
|
||||||
|
|
||||||
|
//======================================================================
|
||||||
|
|
||||||
|
#if defined(VERILATOR)
|
||||||
|
# ifdef T_DPI_THREADS_COLLIDE
|
||||||
|
# include "Vt_dpi_threads_collide__Dpi.h"
|
||||||
|
# else
|
||||||
|
# include "Vt_dpi_threads__Dpi.h"
|
||||||
|
# endif
|
||||||
|
#elif defined(VCS)
|
||||||
|
# include "../vc_hdrs.h"
|
||||||
|
#elif defined(CADENCE)
|
||||||
|
# define NEED_EXTERNS
|
||||||
|
#else
|
||||||
|
# error "Unknown simulator for DPI test"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef NEED_EXTERNS
|
||||||
|
extern "C" {
|
||||||
|
extern void dpii_sys_task();
|
||||||
|
extern int dpii_failure();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//======================================================================
|
||||||
|
|
||||||
|
struct state {
|
||||||
|
std::atomic<bool> task_is_running;
|
||||||
|
std::atomic<int> failure;
|
||||||
|
state() : task_is_running(false)
|
||||||
|
, failure(false) {}
|
||||||
|
};
|
||||||
|
|
||||||
|
static state st;
|
||||||
|
|
||||||
|
void dpii_sys_task() {
|
||||||
|
bool other_task_running = atomic_exchange(&st.task_is_running, true);
|
||||||
|
if (other_task_running) {
|
||||||
|
// Another task is running. This is a collision.
|
||||||
|
st.failure = 1;
|
||||||
|
std::cerr << "t_dpi_threads_c.cpp dpii_sys_task() saw threads collide.\n";
|
||||||
|
} else {
|
||||||
|
std::cerr << "t_dpi_threads_c.cpp dpii_sys_task() no collision. @" << &st.task_is_running << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Spend some time in the DPI call, so that if we can have a collision
|
||||||
|
// we probably will. Technically this is not guaranteed to detect every
|
||||||
|
// race. However, one second is so much greater than the expected
|
||||||
|
// runtime of everything else in the test, it really should pick up on
|
||||||
|
// races just about all of the time.
|
||||||
|
sleep(1);
|
||||||
|
|
||||||
|
atomic_exchange(&st.task_is_running, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
int dpii_failure() {
|
||||||
|
return st.failure;
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,28 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
|
||||||
|
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||||
|
#
|
||||||
|
# Copyright 2018 by Wilson Snyder. This program is free software; you can
|
||||||
|
# redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
|
# Version 2.0.
|
||||||
|
|
||||||
|
scenarios(vltmt => 1);
|
||||||
|
|
||||||
|
top_filename("t/t_dpi_threads.v");
|
||||||
|
|
||||||
|
compile(
|
||||||
|
v_flags2 => ["t/t_dpi_threads_c.cpp --threads-dpi all --no-threads-coarsen"],
|
||||||
|
);
|
||||||
|
|
||||||
|
# Similar to t_dpi_threads, which confirms that Verilator can prevent a
|
||||||
|
# race between DPI import calls, this test confirms that the race exists
|
||||||
|
# and that the DPI C code can detect it under --threads-dpi all
|
||||||
|
# mode.
|
||||||
|
#
|
||||||
|
execute(
|
||||||
|
fails => 1,
|
||||||
|
);
|
||||||
|
|
||||||
|
ok(1);
|
||||||
|
1;
|
||||||
|
|
@ -43,7 +43,10 @@ gen($Self->{top_filename}, 6000);
|
||||||
compile(
|
compile(
|
||||||
verilator_flags2=>["-x-assign fast --x-initial fast",
|
verilator_flags2=>["-x-assign fast --x-initial fast",
|
||||||
"-Wno-UNOPTTHREADS",
|
"-Wno-UNOPTTHREADS",
|
||||||
],
|
# The slow V3Partition asserts are just too slow
|
||||||
|
# in this test. They're disabled just for performance
|
||||||
|
# reasons:
|
||||||
|
"--no-debug-partition"],
|
||||||
);
|
);
|
||||||
|
|
||||||
execute(
|
execute(
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,74 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
|
||||||
|
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||||
|
#
|
||||||
|
# Copyright 2003 by Wilson Snyder. This program is free software; you can
|
||||||
|
# redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
|
# Version 2.0.
|
||||||
|
|
||||||
|
use IO::File;
|
||||||
|
|
||||||
|
# Test for bin/verilator_gantt.
|
||||||
|
#
|
||||||
|
# Only needed in multithreaded regression.
|
||||||
|
scenarios(vltmt => 1);
|
||||||
|
|
||||||
|
# It doesn't really matter what test
|
||||||
|
# we use, so long as it runs several cycles,
|
||||||
|
# enough for the profiling to happen:
|
||||||
|
top_filename("t/t_gen_alw.v");
|
||||||
|
|
||||||
|
compile(
|
||||||
|
v_flags2 => ["--prof-threads"]
|
||||||
|
);
|
||||||
|
|
||||||
|
execute(
|
||||||
|
all_run_flags => ["+verilator+prof+threads+start+2",
|
||||||
|
" +verilator+prof+threads+window+2",
|
||||||
|
" +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
|
||||||
|
],
|
||||||
|
check_finished => 1,
|
||||||
|
);
|
||||||
|
|
||||||
|
# For now, verilator_gantt still reads from STDIN
|
||||||
|
# (probably it should take a file, gantt.dat like verilator_profcfunc)
|
||||||
|
# The profiling data still goes direct to the runtime's STDOUT
|
||||||
|
# (maybe that should go to a separate file - gantt.dat?)
|
||||||
|
run(cmd => ["$ENV{VERILATOR_ROOT}/bin/verilator_gantt",
|
||||||
|
"$Self->{obj_dir}/profile_threads.dat",
|
||||||
|
"--vcd $Self->{obj_dir}/profile_threads.vcd",
|
||||||
|
"> $Self->{obj_dir}/gantt.log"]);
|
||||||
|
|
||||||
|
# We should have three lines of gantt chart, each with
|
||||||
|
# an even number of mtask-bars (eg "[123--]")
|
||||||
|
my $gantt_line_ct = 0;
|
||||||
|
my $global_mtask_ct = 0;
|
||||||
|
{
|
||||||
|
my $fh = IO::File->new("<$Self->{obj_dir}/gantt.log")
|
||||||
|
or error("$! $Self->{obj_dir}/gantt.log");
|
||||||
|
while (my $line = ($fh && $fh->getline)) {
|
||||||
|
if ($line !~ m/^ t:/) { next; }
|
||||||
|
$gantt_line_ct++;
|
||||||
|
my $this_thread_mtask_ct = 0;
|
||||||
|
my @mtasks = split(/\[/, $line);
|
||||||
|
shift @mtasks; # throw the '>> ' away
|
||||||
|
foreach my $mtask (@mtasks) {
|
||||||
|
# Format of each mtask is "[123--]" where the hyphens,
|
||||||
|
# number, or ] may or may not appear; it depends on exact timing.
|
||||||
|
$this_thread_mtask_ct++;
|
||||||
|
$global_mtask_ct++;
|
||||||
|
}
|
||||||
|
if ($this_thread_mtask_ct % 2 != 0) { error("odd number of mtasks found"); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ($gantt_line_ct != 3) { error("wrong number of gantt lines"); }
|
||||||
|
if ($global_mtask_ct == 0) { error("wrong number of mtasks, should be > 0"); }
|
||||||
|
print "Found $gantt_line_ct lines of gantt data with $global_mtask_ct mtasks\n"
|
||||||
|
if $Self->{verbose};
|
||||||
|
|
||||||
|
# Diff to itself, just to check parsing
|
||||||
|
vcd_identical("$Self->{obj_dir}/profile_threads.vcd", "$Self->{obj_dir}/profile_threads.vcd");
|
||||||
|
|
||||||
|
ok(1);
|
||||||
|
1;
|
||||||
|
|
@ -117,6 +117,10 @@ compile(
|
||||||
);
|
);
|
||||||
|
|
||||||
execute(
|
execute(
|
||||||
|
all_run_flags => ["+verilator+prof+threads+start+100",
|
||||||
|
" +verilator+prof+threads+window+2",
|
||||||
|
" +verilator+prof+threads+file+$Self->{obj_dir}/profile_threads.dat",
|
||||||
|
],
|
||||||
check_finished => 1,
|
check_finished => 1,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ foreach my $prog (
|
||||||
"../bin/verilator",
|
"../bin/verilator",
|
||||||
"../bin/verilator_coverage",
|
"../bin/verilator_coverage",
|
||||||
"../bin/verilator_difftree",
|
"../bin/verilator_difftree",
|
||||||
|
"../bin/verilator_gantt",
|
||||||
"../bin/verilator_profcfunc",
|
"../bin/verilator_profcfunc",
|
||||||
) {
|
) {
|
||||||
run(fails => 1,
|
run(fails => 1,
|
||||||
|
|
|
||||||
|
|
@ -38,7 +38,8 @@ sub checkRelativeRefs {
|
||||||
if ($Self->{vlt_all}) {
|
if ($Self->{vlt_all}) {
|
||||||
# We expect to combine sequent functions across multiple instances of
|
# We expect to combine sequent functions across multiple instances of
|
||||||
# l2, l3, l4, l5. If this number drops, please confirm this has not broken.
|
# l2, l3, l4, l5. If this number drops, please confirm this has not broken.
|
||||||
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 52);
|
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
|
||||||
|
($Self->{vltmt} ? 84 : 52));
|
||||||
|
|
||||||
# Expect absolute refs in CFuncs for t (top module) and l1 (because it
|
# Expect absolute refs in CFuncs for t (top module) and l1 (because it
|
||||||
# has only one instance)
|
# has only one instance)
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,8 @@ compile(
|
||||||
if ($Self->{vlt_all}) {
|
if ($Self->{vlt_all}) {
|
||||||
# Fewer optimizations than t_inst_tree_inl0_pub1 which allows
|
# Fewer optimizations than t_inst_tree_inl0_pub1 which allows
|
||||||
# relative CFuncs:
|
# relative CFuncs:
|
||||||
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i, 31);
|
file_grep ($Self->{stats}, qr/Optimizations, Combined CFuncs\s+(\d+)/i,
|
||||||
|
($Self->{vltmt} ? 0 : 31));
|
||||||
|
|
||||||
# Should not find any 'this->' except some 'this->__VlSymsp'
|
# Should not find any 'this->' except some 'this->__VlSymsp'
|
||||||
my @files = `ls $Self->{obj_dir}/*.cpp`;
|
my @files = `ls $Self->{obj_dir}/*.cpp`;
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
|
||||||
# Lesser General Public License Version 3 or the Perl Artistic License
|
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
# Version 2.0.
|
# Version 2.0.
|
||||||
|
|
||||||
scenarios(simulator => 1);
|
scenarios(vltmt => 1);
|
||||||
$Self->cfg_with_threaded or skip("No thread support");
|
|
||||||
|
|
||||||
top_filename("t/t_threads_counter.v");
|
top_filename("t/t_threads_counter.v");
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
|
||||||
# Lesser General Public License Version 3 or the Perl Artistic License
|
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
# Version 2.0.
|
# Version 2.0.
|
||||||
|
|
||||||
scenarios(simulator => 1);
|
scenarios(vltmt => 1);
|
||||||
$Self->cfg_with_threaded or skip("No thread support");
|
|
||||||
|
|
||||||
top_filename("t/t_threads_counter.v");
|
top_filename("t/t_threads_counter.v");
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,23 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
|
||||||
|
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||||
|
#
|
||||||
|
# Copyright 2003-2009 by Wilson Snyder. This program is free software; you can
|
||||||
|
# redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
|
# Version 2.0.
|
||||||
|
|
||||||
|
scenarios(vltmt => 1);
|
||||||
|
|
||||||
|
top_filename("t/t_threads_counter.v");
|
||||||
|
|
||||||
|
compile(
|
||||||
|
verilator_flags2 => ['--cc --threads 4'],
|
||||||
|
);
|
||||||
|
|
||||||
|
execute(
|
||||||
|
check_finished => 1,
|
||||||
|
);
|
||||||
|
|
||||||
|
ok(1);
|
||||||
|
1;
|
||||||
|
|
@ -0,0 +1,25 @@
|
||||||
|
#!/usr/bin/perl
|
||||||
|
if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
|
||||||
|
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||||
|
#
|
||||||
|
# Copyright 2003-2009 by Wilson Snyder. This program is free software; you can
|
||||||
|
# redistribute it and/or modify it under the terms of either the GNU
|
||||||
|
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
|
# Version 2.0.
|
||||||
|
|
||||||
|
scenarios(vltmt => 1);
|
||||||
|
|
||||||
|
top_filename("t/t_threads_counter.v");
|
||||||
|
|
||||||
|
compile(
|
||||||
|
verilator_flags2 => ['--cc --threads 2 --debug-nondeterminism'],
|
||||||
|
);
|
||||||
|
|
||||||
|
execute(
|
||||||
|
check_finished => 1,
|
||||||
|
);
|
||||||
|
|
||||||
|
file_grep("$Self->{obj_dir}/vlt_compile.log", qr/hash of shape/i);
|
||||||
|
|
||||||
|
ok(1);
|
||||||
|
1;
|
||||||
|
|
@ -13,7 +13,12 @@ my $root = "..";
|
||||||
|
|
||||||
compile(
|
compile(
|
||||||
# Can't use --coverage and --savable together, so cheat and compile inline
|
# Can't use --coverage and --savable together, so cheat and compile inline
|
||||||
verilator_flags2 => ['--cc --coverage-toggle --coverage-line --coverage-user --trace --vpi $root/include/verilated_save.cpp'],
|
verilator_flags2 => ["--cc",
|
||||||
|
"--coverage-toggle --coverage-line --coverage-user",
|
||||||
|
"--trace --vpi ",
|
||||||
|
($Self->cfg_with_threaded
|
||||||
|
? "--threads 2 $root/include/verilated_threads.cpp" : ""),
|
||||||
|
"$root/include/verilated_save.cpp"],
|
||||||
);
|
);
|
||||||
|
|
||||||
execute(
|
execute(
|
||||||
|
|
@ -43,7 +48,8 @@ foreach my $dfile (glob("$Self->{obj_dir}/*.d")) {
|
||||||
|
|
||||||
foreach my $file (sort keys %hit) {
|
foreach my $file (sort keys %hit) {
|
||||||
if (!$hit{$file}
|
if (!$hit{$file}
|
||||||
&& $file !~ /_sc/) {
|
&& $file !~ /_sc/
|
||||||
|
&& ($file !~ /_thread/ || $Self->cfg_with_threaded)) {
|
||||||
error("Include file not covered by t_verilated_all test: ",$file);
|
error("Include file not covered by t_verilated_all test: ",$file);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,7 @@ if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); di
|
||||||
# Lesser General Public License Version 3 or the Perl Artistic License
|
# Lesser General Public License Version 3 or the Perl Artistic License
|
||||||
# Version 2.0.
|
# Version 2.0.
|
||||||
|
|
||||||
scenarios(simulator => 1);
|
scenarios(vltmt => 1);
|
||||||
$Self->cfg_with_threaded or skip("No thread support");
|
|
||||||
|
|
||||||
top_filename("t/t_verilated_all.v");
|
top_filename("t/t_verilated_all.v");
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue