diff --git a/Changes b/Changes index 14b034de7..72f687a4d 100644 --- a/Changes +++ b/Changes @@ -5,15 +5,18 @@ The contributors that suggested a given feature are shown in []. Thanks! * Verilator 4.000 devel ** This is a major release. Any patches may require major rework to apply. + [Thanks everyone] + +** Add multithreaded model generation. ** Add runtime arguments. -** Fix internals to be C++ null-pointer-check clean. - *** Better optimize large always block splitting, bug1244. [John Coiner] *** Add new reloop optimization for repetitive assignment compression. +**** Fix internals to be C++ null-pointer-check clean. + **** Fix internals to avoid 'using namespace std'. **** Fix Verilation performance issues, bug1316. [John Coiner] diff --git a/Makefile.in b/Makefile.in index c864db313..c087a4736 100644 --- a/Makefile.in +++ b/Makefile.in @@ -120,6 +120,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \ bin/verilator \ bin/verilator_coverage \ bin/verilator_difftree \ + bin/verilator_gantt \ bin/verilator_includer \ bin/verilator_profcfunc \ doxygen-mainpage doxygen.config veripool-logo.png \ @@ -154,6 +155,7 @@ DISTFILES_INC = $(INFOS) .gitignore Artistic COPYING COPYING.LESSER \ INST_PROJ_FILES = \ bin/verilator \ bin/verilator_coverage \ + bin/verilator_gantt \ bin/verilator_includer \ bin/verilator_profcfunc \ include/verilated.mk \ @@ -272,12 +274,12 @@ internals.pdf: internals.pod Makefile # See uninstall also - don't put wildcards in this variable, it might uninstall other stuff VL_INST_BIN_FILES = verilator verilator_bin verilator_bin_dbg verilator_coverage_bin_dbg \ - verilator_coverage verilator_includer verilator_profcfunc + verilator_coverage verilator_gantt verilator_includer verilator_profcfunc # Some scripts go into both the search path and pkgdatadir, # so they can be found by the user, and under $VERILATOR_ROOT. 
# See uninstall also - don't put wildcards in this variable, it might uninstall other stuff -VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_profcfunc.1 +VL_INST_MAN_FILES = verilator.1 verilator_coverage.1 verilator_gantt.1 verilator_profcfunc.1 VL_INST_INC_BLDDIR_FILES = \ include/verilated_config.h \ @@ -295,6 +297,7 @@ installbin: $(SHELL) ${srcdir}/mkinstalldirs $(DESTDIR)$(bindir) ( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator $(DESTDIR)$(bindir)/verilator ) ( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_coverage $(DESTDIR)$(bindir)/verilator_coverage ) + ( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_gantt $(DESTDIR)$(bindir)/verilator_gantt ) ( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_profcfunc $(DESTDIR)$(bindir)/verilator_profcfunc ) ( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin $(DESTDIR)$(bindir)/verilator_bin ) ( cd ${srcdir}/bin ; $(INSTALL_PROGRAM) verilator_bin_dbg $(DESTDIR)$(bindir)/verilator_bin_dbg ) diff --git a/bin/verilator b/bin/verilator index 4f6516c2d..2a6a45cc2 100755 --- a/bin/verilator +++ b/bin/verilator @@ -338,6 +338,7 @@ detailed descriptions in L for more information. --pipe-filter Filter all input through a script --prefix Name of top level class --prof-cfuncs Name functions for profiling + --prof-threads Enable generating gantt chart data for threads --private Debugging; see docs --public Debugging; see docs -pvalue+= Overwrite toplevel parameter @@ -350,6 +351,9 @@ detailed descriptions in L for more information. --stats-vars Provide statistics on variables -sv Enable SystemVerilog parsing +systemverilogext+ Synonym for +1800-2017ext+ + --threads Enable multithreading + --threads-dpi Enable multithreaded DPI + --threads-max-mtasks Tune maximum mtask partitioning --top-module Name of top level input module --trace Enable waveform creation --trace-depth Depth of tracing @@ -386,6 +390,9 @@ detailed descriptions in L for more information. 
+verilator+debug Enable debugging +verilator+debugi+ Enable debugging at a level +verilator+help Display help + +verilator+prof+threads+file+I Set profile filename + +verilator+prof+threads+start+I Set profile starting point + +verilator+prof+threads+window+I Set profile duration +verilator+rand+reset+ Set random reset technique +verilator+V Verbose version and config +verilator+version Show version and exit @@ -1080,6 +1087,18 @@ Verilog module and line number the statement came from. This allows gprof or oprofile reports to be correlated with the original Verilog source statements. See also L. +=item --prof-threads + +Enable gantt chart data collection for threaded builds. + +Verilator will record the start and end time of each macro-task across a +number of calls to eval. (What is a macro-task? See the Verilator internals +document.) + +When profiling is enabled, the runtime will emit a blurb of profiling data +in non-human-friendly form. The C<verilator_gantt> script will transform +this into a nicer visual format and produce some related statistics. + =item --private Opposite of --public. Is the default; this option exists for backwards @@ -1134,7 +1153,10 @@ Enable including save and restore functions in the generated model. The user code must create a VerilatedSerialize or VerilatedDeserialze object then calling the << or >> operators on the generated model and any -other data the process needs saved/restored. For example: +other data the process needs saved/restored. These functions are not +thread safe, and are typically called only by a main thread. + +For example: void save_model(const char* filenamep) { VerilatedSave os; @@ -1173,6 +1195,42 @@ compatibility with other simulators. A synonym for C<+1800-2017ext+>I. +=item --threads I + +=item --no-threads + +With --threads 0 or --no-threads, the default, the generated model is not +thread safe. With --threads 1, the generated model is single threaded but +may run in a multithreaded environment.
With --threads N, where N >= 2, the +model is generated to run multithreaded on up to N threads. See +L</"MULTITHREADING">. + +=item --threads-dpi all + +=item --threads-dpi none + +=item --threads-dpi pure + +When using --dpi with --threads, control what DPI tasks are thread safe. + +With --threads-dpi all, enable Verilator to assume all DPI imports are +threadsafe, and to use thread-local storage for communication with DPI, +potentially improving performance. Any DPI libraries need appropriate +mutexes to avoid undefined behavior. + +With --threads-dpi none, Verilator assumes DPI imports are not thread safe, +and Verilator will serialize calls to DPI imports by default, potentially +harming performance. + +With --threads-dpi pure, the default, Verilator assumes DPI pure imports +are threadsafe, but non-pure DPI imports are not. + +=item --threads-max-mtasks I + +Rarely needed. When using --threads, specify the number of mtasks the +model is to be partitioned into. If unspecified, Verilator approximates a +good value. + =item --top-module I When the input Verilog contains more than one top level module, specifies @@ -1464,6 +1522,28 @@ Enable debugging at the provided level. Display help and exit. +=item +verilator+prof+threads+file+I + +When using --prof-threads, the filename to dump to. Defaults to +"profile_threads.dat". + +=item +verilator+prof+threads+start+I + +When using --prof-threads, Verilator will wait until $time is at this +value, then start the profiling warmup, then capturing. Generally this +should be set to some time that is well within the normal operation of the +simulation, i.e. outside of reset. If 0, the dump is disabled. Defaults to +1. + +=item +verilator+prof+threads+window+I + +When using --prof-threads, after $time reaches ++verilator+prof+threads+start, Verilator will warm up the profiling for +this number of eval() calls, then will capture the profiling of this number +of eval() calls.
Defaults to 2, which makes sense for a +single-clock-domain module where it's typical to want to capture one +posedge eval() and one negedge eval(). + =item +verilator+rand+reset+I When a model was Verilated using "-x-inital unique", sets the @@ -1635,6 +1715,9 @@ compile times, and --x-assign=fast --x-initial=fast may increase the risk of reset bugs in trade for performance; see the above documentation for these flags. +If using Verilated multithreaded, use C<numactl> to ensure you are using +non-conflicting hardware resources. See L</"MULTITHREADING">. + Minor Verilog code changes can also give big wins. You should not have any UNOPTFLAT warnings from Verilator. Fixing these warnings can result in huge improvements; one user fixed their one UNOPTFLAT warning by making a @@ -2176,6 +2259,89 @@ the names of the .cpp files to compile in from the make variables generated in obj_dir/Vour_classes.mk. +=head1 MULTITHREADING + +Verilator experimentally supports multithreading. + +With --no-threads, the default, the model is not thread safe, and any use +of more than one thread calling into one or even different Verilated models +may result in unpredictable behavior. This gives the highest single thread +performance. + +With --threads 1, the generated model is single threaded, however the +support libraries are multithread safe. This allows different +instantiations of model(s) to potentially each be run under a different +thread. All threading is the responsibility of the user's C++ testbench. + +With --threads N, where N is at least 2, the generated model will be +designed to run in parallel on N threads. The thread calling eval() +provides one of those threads, and the generated model will create and +manage the other N-1 threads. It's the client's responsibility not to +oversubscribe the available CPU cores.
Under CPU oversubscription, the +Verilated model should not livelock nor deadlock, however, you can expect +performance to be far worse than it would be with proper stoichiometry of +threads and CPU cores. + +The remainder of this section describes behavior with --threads 1 or +--threads N (not --no-threads). + +VL_THREADED is defined when compiling a threaded Verilated module, causing +the Verilated support classes to become threadsafe. + +The thread used for constructing a model must be the same thread that +calls eval() into the model, this is called the "eval thread". The thread +used to perform certain global operations such as saving and tracing must +be done by a "main thread". In most cases the eval thread and main thread +are the same thread (i.e. the user's top C++ testbench runs on a single +thread), but this is not required. + +When running a multithreaded model, the default Linux task scheduler often +works against the model, by assuming threads are short lived, and thus +often schedules threads using multiple hyperthreads within the same +physical core. For best performance use the C<numactl> program to (when the +threading count fits) select unique physical cores on the same socket. For +example, if a model was Verilated with "--threads 4", we consult + + egrep 'processor|physical id|core id' /proc/cpuinfo + +To select cores 0, 1, 2, and 3 that are all located on the same socket (0) +but different physical cores. (Also useful is "numactl --hardware", or +C<lscpu> but those don't show Hyperthreading cores.) Then we execute + + numactl -m 0 -C 0,1,2,3 -- verilated_executable_name + +This will limit memory to socket 0, and threads to cores 0, 1, 2, 3, +(presumably on socket 0) optimizing performance. Of course this must be +adjusted if you want another simulator using e.g. socket 1, or if you +Verilated with a different number of threads. To see what CPUs are +actually used, use --prof-threads.
+ +=head2 Multithreaded Verilog and Library Support + +$display/$stop/$finish are delayed until the end of an eval() call in order +to maintain ordering between threads. This may result in additional tasks +completing after the $stop or $finish. + +If using --coverage, the coverage routines are fully thread safe. + +If using --dpi, Verilator assumes pure DPI imports are thread safe, +balancing performance versus safety. See --threads-dpi. + +If using --savable, the save/restore classes are not multithreaded and +must be called only by the eval thread. + +If using --sc, the SystemC kernel is not thread safe, therefore the eval +thread and main thread must be the same. + +If using --trace, the tracing classes must be constructed and called from +the main thread. + +If using --vpi, since SystemVerilog VPI was not architected by IEEE to be +multithreaded, Verilator requires all VPI calls are only made from the main +thread. + +=back + =head1 CONFIGURATION FILES In addition to the command line, warnings and other features may be @@ -3636,6 +3802,21 @@ section for more details. Ignoring this warning will only slow simulations, it will simulate correctly. +=item UNOPTTHREADS + +Warns that the thread scheduler was unable to partition the design to fill +the requested number of threads. + +One workaround is to request fewer threads with C<--threads>. + +Another possible workaround is to allow more MTasks in the runtime, by +increasing the value of --threads-max-mtasks. More MTasks will result in +more communication and synchronization overhead at runtime; the scheduler +attempts to minimize the number of MTasks for this reason. + +Ignoring this warning will only slow simulations, it will simulate +correctly. + =item UNPACKED Warns that unpacked structs and unions are not supported. @@ -4185,6 +4366,8 @@ performance gain. In 2009, major SystemVerilog and DPI language support was added. +In 2018, Verilator 4.000 was released with multithreaded support.
+ Currently, various language features and performance enhancements are added as the need arises. Verilator is now about 3x faster than in 2002, and is faster than many popular commercial simulators. @@ -4282,7 +4465,7 @@ License Version 2.0. =head1 SEE ALSO -L, L, L, +L, L, L, L, L which is the source for this document, diff --git a/bin/verilator_gantt b/bin/verilator_gantt new file mode 100755 index 000000000..cf4bf6c65 --- /dev/null +++ b/bin/verilator_gantt @@ -0,0 +1,559 @@ +: # -*-Mode: perl;-*- use perl, wherever it is +eval 'exec perl -wS $0 ${1+"$@"}' + if 0; +# See copyright, etc in below POD section. +###################################################################### + +use strict; +use warnings; +use Getopt::Long; +use Pod::Usage; +use vars qw ($Debug); + +$Debug = 0; +my $Opt_File; +my $Opt_Time_Per_Char = 0; # rdtsc ticks per char in gantt chart, 0=auto +my $opt_vcd = "profile_threads.vcd"; + +our %Threads; +our %Mtasks; +our %Global; + +autoflush STDOUT 1; +autoflush STDERR 1; +Getopt::Long::config ("no_auto_abbrev"); +if (! GetOptions ( + "help" => \&usage, + "scale=i" => \$Opt_Time_Per_Char, + "debug" => sub { $Debug = 1; }, + "vcd=s" => \$opt_vcd, + "no-vcd!" 
=> sub { $opt_vcd = undef; }, + "<>" => \¶meter, + )) { + die "%Error: Bad usage, try 'verilator_gantt --help'\n"; +} + +$Opt_File = "profile_threads.dat" if !defined $Opt_File; + +process($Opt_File); +write_vcd($opt_vcd) if defined $opt_vcd; +exit(0); + +####################################################################### + +sub usage { + pod2usage(-verbose=>2, -exitval=>2, -output=>\*STDOUT); + exit (1); +} + +sub parameter { + my $param = shift; + if (!defined $Opt_File) { + $Opt_File = $param; + } else { + die "%Error: Unknown parameter: $param\n"; + } +} + +####################################################################### + +sub process { + my $filename = shift; + + read_data($filename); + report(); +} + +####################################################################### + +sub read_data { + my $filename = shift; + + %Global = (rdtsc_cycle_time => 0); + + my $fh = IO::File->new ($filename) or die "%Error: $! $filename,"; + while (my $line = $fh->getline) { + if ($line =~ m/VLPROF mtask\s(\d+)\sstart\s(\d+)\send\s(\d+)\selapsed\s(\d+)\spredict_time\s(\d+)\scpu\s(\d+)\son thread (\d+)/) { + my $mtask = $1; + my $start = $2; + my $end = $3; + my $elapsed_time = $4; + my $predict_time = $5; + my $cpu = $6; + my $thread = $7; + $Threads{$thread}{$start}{mtask} = $mtask; + $Threads{$thread}{$start}{end} = $end; + $Threads{$thread}{$start}{cpu} = $cpu; + + if (!exists $Mtasks{$mtask}{elapsed}) { + $Mtasks{$mtask}{elapsed} = 0; + } + $Mtasks{$mtask}{elapsed} += $elapsed_time; + $Mtasks{$mtask}{predict} = $predict_time; + $Mtasks{$mtask}{end} = max($Mtasks{$mtask}{end}, $end); + } + elsif ($line =~ /^VLPROFTHREAD/) {} + elsif ($line =~ m/VLPROF arg\s+(\S+)\+([0-9.])\s*$/ + || $line =~ m/VLPROF arg\s+(\S+)\s+([0-9.])\s*$/) { + $Global{args}{$1} = $2; + } + elsif ($line =~ m/VLPROF stat\s+(\S+)\s+([0-9.]+)/) { + $Global{stats}{$1} = $2; + } + elsif ($line =~ /^#/) {} + elsif ($Debug) { + chomp $line; + print "Unk: $line\n"; + } + # TODO -- this is parsing 
text printed by a client. + # Really, verilator proper should generate this + # if it's useful... + if ($line =~ m/rdtsc time = (\d+) ticks/) { + $Global{rdtsc_cycle_time} = $1; + } + } +} + +sub report { + print "Verilator Gantt report\n"; + + print "\nArgument settings:\n"; + foreach my $arg (sort keys %{$Global{args}}) { + my $plus = ($arg =~ /^\+/) ? "+" : " "; + printf " %s%s%d\n", $arg, $plus, $Global{args}{$arg}; + } + + my $nthreads = scalar keys %Threads; + $Global{cpus}{cpu_time} = {}; + foreach my $thread (keys %Threads) { + # Make potentially multiple characters per column + foreach my $start (keys %{$Threads{$thread}}) { + my $cpu = $Threads{$thread}{$start}{cpu}; + my $elapsed = $Threads{$thread}{$start}{end} - $start; + $Global{cpus}{cpu_time}{$cpu} += $elapsed; + } + } + + my $mt_mtask_time = 0; + my $long_mtask_time = 0; + my $last_end = 0; + foreach my $mtask (keys %Mtasks) { + $mt_mtask_time += $Mtasks{$mtask}{elapsed}; + $last_end = max($last_end, $Mtasks{$mtask}{end}); + $long_mtask_time = max($long_mtask_time, $Mtasks{$mtask}{elapsed}); + } + $Global{last_end} = $last_end; + + report_graph(); + + # If we know cycle time in the same (rdtsc) units, + # this will give us an actual utilization number, + # (how effectively we keep the cores busy.) + # + # It also gives us a number we can compare against + # serial mode, to estimate the overhead of data sharing, + # which will show up in the total elapsed time. (Overhead + # of synchronization and scheduling should not.) 
+ print "\nAnalysis:\n"; + printf " Total threads = %d\n", $nthreads; + printf " Total mtasks = %d\n", scalar (keys %Mtasks); + printf " Total cpus used = %d\n", scalar (keys %{$Global{cpus}}); + printf " Total yields = %d\n", $Global{stats}{yields}; + printf " Total eval time = %d rdtsc ticks\n", $Global{last_end}; + printf " Longest mtask time = %d rdtsc ticks\n", $long_mtask_time; + printf " All-thread mtask time = %d rdtsc ticks\n", $mt_mtask_time; + my $long_efficiency = $long_mtask_time/($Global{last_end}); + printf " Longest-thread efficiency = %0.1f%%\n", $long_efficiency*100; + my $mt_efficiency = $mt_mtask_time/($Global{last_end}*$nthreads); + printf " All-thread efficiency = %0.1f%%\n", $mt_efficiency*100; + printf " All-thread speedup = %0.1f\n", $mt_efficiency*$nthreads; + if ($Global{rdtsc_cycle_time} > 0) { + my $ut = $mt_mtask_time / $Global{rdtsc_cycle_time}; + print "tot_mtask_cpu=$mt_mtask_time cyc=$Global{rdtsc_cycle_time} ut=$ut\n"; + } + + my @p2e_ratios; + my $min_p2e = 1000000; + my $min_mtask; + my $max_p2e = -1000000; + my $max_mtask; + foreach my $mtask (sort keys %Mtasks) { + if ($Mtasks{$mtask}{elapsed} > 0) { + if ($Mtasks{$mtask}{predict} == 0) { + $Mtasks{$mtask}{predict} = 1; # don't log(0) below + } + my $p2e_ratio = log( $Mtasks{$mtask}{predict} / $Mtasks{$mtask}{elapsed} ); + #print "log(p2e $mtask) = $p2e_ratio (predict $Mtasks{$mtask}{predict}, elapsed $Mtasks{$mtask}{elapsed})\n"; + push @p2e_ratios, $p2e_ratio; + + if ($p2e_ratio > $max_p2e) { + $max_p2e = $p2e_ratio; + $max_mtask = $mtask; + } + if ($p2e_ratio < $min_p2e) { + $min_p2e = $p2e_ratio; + $min_mtask = $mtask; + } + } + } + + print "\nStatistics:\n"; + print " min log(p2e) = $min_p2e from mtask $min_mtask (predict $Mtasks{$min_mtask}{predict}, elapsed $Mtasks{$min_mtask}{elapsed})\n"; + print " max log(p2e) = $max_p2e from mtask $max_mtask (predict $Mtasks{$max_mtask}{predict}, elapsed $Mtasks{$max_mtask}{elapsed})\n"; + + my $stddev = stddev(\@p2e_ratios); + my 
$mean = mean(\@p2e_ratios); + print " mean = " . ($mean) . "\n"; + print " stddev = " . ($stddev) . "\n"; + print " e ^ stddev = " . exp($stddev). "\n"; + print "\n"; +} + +sub report_graph { + my $time_per = $Opt_Time_Per_Char; + if ($time_per == 0) { + $time_per = ($Global{last_end} / 40); # Start with 40 columns + while ($time_per > 10) { + my ($graph, $conflicts) = _make_graph($time_per); + last if !$conflicts; + $time_per = int($time_per/2); + } + # One more step so we can fit more labels + $time_per = int($time_per/2); + } + + my ($graph, $conflicts) = _make_graph($time_per); + + print "\nThread gantt graph:\n"; + print " Legend: One character width = $time_per rdtsc ticks\n"; + print " Legend: '&' = multiple mtasks in this period (character width)\n"; + + my $scale = " <-".$Global{last_end}." rdtsc total"; + for (my $col = length($scale); # -2 for '->' below + $col < ($Global{last_end}/$time_per); ++$col) { + $scale .= "-"; + } + print " $scale->\n"; + + foreach my $thread (sort keys %{$graph}) { + print " t: "; + _print_graph_line($graph->{$thread}, ''); + } +} + +sub _make_graph { + my $time_per = shift; + + my $graph = {}; # {thread}{column}{char=>'x' or chars=>#} + my $conflicts = 0; + foreach my $thread (keys %Threads) { + # Make potentially multiple characters per column + foreach my $start (sort {$a <=> $b} keys %{$Threads{$thread}}) { + my $end = $Threads{$thread}{$start}{end}; + my $mtask = $Threads{$thread}{$start}{mtask}; + my $cpu = $Threads{$thread}{$start}{cpu}; + + my $startcol = _time_col($time_per, $start); + my $endcol = _time_col($time_per, $end); + + my $label = "["; + $label .= "$cpu"; # Maybe make optional in future + my $width = $endcol - $startcol + 1; + while (length($label) < ($width-1)) { # -1 for ']' + $label .= "-"; + } + $label .= "]"; + $graph->{$thread}[$startcol]{char} .= $label; + } + if ($Debug) { + print "# Multicol: "; _print_graph_line($graph->{$thread}, '|'); + } + # Expand line to one char per column + for (my $col = 
0; $col <= $#{$graph->{$thread}}; ++$col) { + if (my $chars = $graph->{$thread}[$col]{char}) { + my $ok = 1; + for (my $coladd = 1; $coladd{$thread}[$col + $coladd]{char}) { + $ok = 0; last; + } + } + if (!$ok) { + if ($chars =~ /\[.*\[/) { # Two begins or more + $conflicts++; + $graph->{$thread}[$col]{char} = "&"; + } else { + $graph->{$thread}[$col]{char} = "["; + } + for (my $coladd = 1; $coladd{$thread}[$col + $coladd]{char}) { + last; + } else { + $graph->{$thread}[$col + $coladd]{char} = 'x'; + } + } + } else { + my $coladd = 0; + foreach my $char (split //, $chars) { + $graph->{$thread}[$col+$coladd]{char} = $char; + ++$coladd; + } + } + } + } + if ($Debug) { + print "# Singlcol: "; _print_graph_line($graph->{$thread}, '|'); + } + } + print "# Conflicts $conflicts\n" if $Debug; + return ($graph, $conflicts); +} + +sub _print_graph_line { + my $graph_thread = shift; + my $sep = shift; + for (my $col = 0; $col <= $#{$graph_thread}; ++$col) { + my $c = $graph_thread->[$col]{char}; $c=' ' if !defined $c; + print $c, $sep; + } + print "\n"; +} + +sub _time_col { + my $time_per = shift; + my $time = shift; + return int($time/$time_per); +} + +####################################################################### + +sub write_vcd { + my $filename = shift; + print "Writing $filename\n"; + my $fh = IO::File->new(">$filename") or die "%Error: $! $filename,"; + my $vcd = {values => {}, # {