diff --git a/bin/verilator_gantt b/bin/verilator_gantt
index 59655d464..6938c9ed2 100755
--- a/bin/verilator_gantt
+++ b/bin/verilator_gantt
@@ -26,7 +26,9 @@ Global = {
 }
 ElapsedTime = None  # total elapsed time
 ExecGraphTime = 0  # total elapsed time executing an exec graph
+WaitingTime = 0  # total elapsed time waiting for mtasks
 ExecGraphIntervals = []  # list of (start, end) pairs
+ThreadScheduleWaitIntervals = []  # list of (start, tick, ecpu) pairs
 
 ######################################################################
@@ -37,6 +39,7 @@ def read_data(filename):
     re_record = re.compile(r'^VLPROFEXEC (\S+) (\d+)(.*)$')
     re_payload_mtaskBegin = re.compile(r'id (\d+) predictStart (\d+) cpu (\d+)')
     re_payload_mtaskEnd = re.compile(r'id (\d+) predictCost (\d+)')
+    re_payload_wait = re.compile(r'cpu (\d+)')
 
     re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*')
     re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$')
@@ -49,9 +52,11 @@ def read_data(filename):
     global LongestVcdStrValueLength
     global ExecGraphTime
+    global WaitingTime
 
     ExecGraphStack = []
     SectionStack = []
+    ThreadScheduleWait = collections.defaultdict(list)
     mTaskThread = {}
 
     for line in fh:
@@ -96,6 +101,14 @@ def read_data(filename):
                 Mtasks[mtask]['elapsed'] += tick - begin
                 Mtasks[mtask]['predict_cost'] = predict_cost
                 Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick)
+            elif kind == "THREAD_SCHEDULE_WAIT_BEGIN":
+                ecpu = int(re_payload_wait.match(payload).groups()[0])
+                ThreadScheduleWait[ecpu].append(tick)
+            elif kind == "THREAD_SCHEDULE_WAIT_END":
+                ecpu = int(re_payload_wait.match(payload).groups()[0])
+                start = ThreadScheduleWait[ecpu].pop()
+                WaitingTime += tick - start
+                ThreadScheduleWaitIntervals.append((start, tick, ecpu))
             elif kind == "EXEC_GRAPH_BEGIN":
                 ExecGraphStack.append(tick)
             elif kind == "EXEC_GRAPH_END":
@@ -169,6 +182,7 @@ def report():
     print("\nSummary:")
     print(" Total elapsed time = {} rdtsc ticks".format(ElapsedTime))
     print(" Parallelized code = {:.2%} of elapsed time".format(ExecGraphTime / ElapsedTime))
+    print(" Waiting time = {:.2%} of elapsed time".format(WaitingTime / ElapsedTime))
     print(" Total threads = %d" % nthreads)
     print(" Total CPUs used = %d" % ncpus)
     print(" Total mtasks = %d" % len(Mtasks))
@@ -422,6 +436,9 @@ def write_vcd(filename):
             val = buf.ljust(LongestVcdStrValueLength * 8, "0")
             values[time][code] = val
 
+    def addXValue(code, time):
+        values[time][code] = 'x'
+
     # Measured graph
     for thread in sorted(Threads):
         mcode = getCode(32, 'measured', 't%d_mtask' % thread)
@@ -446,6 +463,12 @@ def write_vcd(filename):
     tStart = sorted(_['start'] for records in Threads.values() for _ in records)
     tEnd = sorted(_['end'] for records in Threads.values() for _ in records)
 
+    # Measured waiting time
+    for (start, end, cpu) in ThreadScheduleWaitIntervals:
+        mcode = getCode(32, 'measured', 'cpu%d_waiting' % cpu)
+        addXValue(mcode, start)
+        addValue(mcode, end, None)
+
     # Predicted graph
     for start, end in ExecGraphIntervals:
         # Find the earliest MTask start after the start point, and the
@@ -532,6 +555,8 @@ def write_vcd(filename):
            value = values[time][code]
            if value is None:
                fh.write("bz v%x\n" % code)
+           elif value == 'x':
+               fh.write("bx v%x\n" % code)
            elif isinstance(value, str):
                fh.write("b%s v%x\n" % (value, code))
            else:
diff --git a/docs/guide/exe_verilator_gantt.rst b/docs/guide/exe_verilator_gantt.rst
index 9916b0a87..135ba1a8f 100644
--- a/docs/guide/exe_verilator_gantt.rst
+++ b/docs/guide/exe_verilator_gantt.rst
@@ -51,6 +51,10 @@ predicted_parallelism
 cpu#_thread
   For the given CPU number, the thread number measured to be executing.
 
+cpu#_waiting
+  For the given CPU number, aggregated waiting time for mtask dependencies.
+  Visualized as `X` values.
+
 mtask#_cpu
   For the given mtask id, the CPU it was measured to execute on.
diff --git a/include/verilated_profiler.cpp b/include/verilated_profiler.cpp
index 3d3a6d697..e6ba57b30 100644
--- a/include/verilated_profiler.cpp
+++ b/include/verilated_profiler.cpp
@@ -199,6 +199,12 @@ void VlExecutionProfiler::dump(const char* filenamep, uint64_t tickEnd)
             fprintf(fp, " id %u predictCost %u\n", payload.m_id, payload.m_predictCost);
             break;
         }
+        case VlExecutionRecord::Type::THREAD_SCHEDULE_WAIT_BEGIN:
+        case VlExecutionRecord::Type::THREAD_SCHEDULE_WAIT_END: {
+            const auto& payload = er.m_payload.threadScheduleWait;
+            fprintf(fp, " cpu %u\n", payload.m_cpu);
+            break;
+        }
         case VlExecutionRecord::Type::SECTION_PUSH: {
             const auto& payload = er.m_payload.sectionPush;
             fprintf(fp, " %s\n", payload.m_name);
diff --git a/include/verilated_profiler.h b/include/verilated_profiler.h
index 5023dcf07..26d669522 100644
--- a/include/verilated_profiler.h
+++ b/include/verilated_profiler.h
@@ -62,6 +62,8 @@ VL_ATTR_ALWINLINE QData VL_CPU_TICK() {
     _VL_FOREACH_APPLY(macro, SECTION_POP) \
     _VL_FOREACH_APPLY(macro, MTASK_BEGIN) \
     _VL_FOREACH_APPLY(macro, MTASK_END) \
+    _VL_FOREACH_APPLY(macro, THREAD_SCHEDULE_WAIT_BEGIN) \
+    _VL_FOREACH_APPLY(macro, THREAD_SCHEDULE_WAIT_END) \
     _VL_FOREACH_APPLY(macro, EXEC_GRAPH_BEGIN) \
     _VL_FOREACH_APPLY(macro, EXEC_GRAPH_END)
 // clang-format on
@@ -95,6 +97,9 @@ class VlExecutionRecord final {
             uint32_t m_id;  // MTask id
             uint32_t m_predictCost;  // How long scheduler predicted would take
         } mtaskEnd;
+        struct {
+            uint32_t m_cpu;  // Executing CPU id
+        } threadScheduleWait;
     };
 
     // STATE
@@ -126,6 +131,14 @@ public:
         m_payload.mtaskEnd.m_predictCost = predictCost;
         m_type = Type::MTASK_END;
     }
+    void threadScheduleWaitBegin() {
+        m_payload.threadScheduleWait.m_cpu = VlOs::getcpu();
+        m_type = Type::THREAD_SCHEDULE_WAIT_BEGIN;
+    }
+    void threadScheduleWaitEnd() {
+        m_payload.threadScheduleWait.m_cpu = VlOs::getcpu();
+        m_type = Type::THREAD_SCHEDULE_WAIT_END;
+    }
     void execGraphBegin() { m_type = Type::EXEC_GRAPH_BEGIN; }
     void execGraphEnd() { m_type = Type::EXEC_GRAPH_END; }
 };
diff --git a/src/V3ExecGraph.cpp b/src/V3ExecGraph.cpp
index 25bd7d864..696cc92c1 100644
--- a/src/V3ExecGraph.cpp
+++ b/src/V3ExecGraph.cpp
@@ -726,7 +726,13 @@ void addMTaskToFunction(const ThreadSchedule& schedule, const uint32_t threadId,
         varp->protect(false);  // Do not protect as we still have references in AstText
         modp->addStmtsp(varp);
         // For now, reference is still via text bashing
+        if (v3Global.opt.profExec()) {
+            addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).threadScheduleWaitBegin();\n");
+        }
         addStrStmt("vlSelf->" + name + +".waitUntilUpstreamDone(even_cycle);\n");
+        if (v3Global.opt.profExec()) {
+            addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).threadScheduleWaitEnd();\n");
+        }
     }
 
     if (v3Global.opt.profPgo()) {
@@ -878,8 +884,14 @@ void addThreadStartToExecGraph(AstExecGraph* const execGraphp,
         }
         V3Stats::addStatSum("Optimizations, Thread schedule total tasks", i);
 
+        if (v3Global.opt.profExec()) {
+            addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).threadScheduleWaitBegin();\n");
+        }
         addStrStmt("vlSelf->__Vm_mtaskstate_final__" + std::to_string(scheduleId) + tag
                    + ".waitUntilUpstreamDone(vlSymsp->__Vm_even_cycle__" + tag + ");\n");
+        if (v3Global.opt.profExec()) {
+            addStrStmt("VL_EXEC_TRACE_ADD_RECORD(vlSymsp).threadScheduleWaitEnd();\n");
+        }
         // Free all assigned worker indices in this section
         if (!v3Global.opt.hierBlocks().empty() && last > 0) {
             addStrStmt("vlSymsp->__Vm_threadPoolp->freeWorkerIndexes(indexes);\n");
diff --git a/test_regress/t/t_gantt_io.dat b/test_regress/t/t_gantt_io.dat
index c5c97d676..78ffc13aa 100644
--- a/test_regress/t/t_gantt_io.dat
+++ b/test_regress/t/t_gantt_io.dat
@@ -910,6 +910,8 @@ VLPROFEXEC EXEC_GRAPH_END 12180
 VLPROFEXEC EXEC_GRAPH_BEGIN 14000
 VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
 VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
+VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 20000 cpu 19
+VLPROFEXEC THREAD_SCHEDULE_WAIT_END 21000 cpu 19
 VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
 VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
 VLPROFEXEC EXEC_GRAPH_END 22085
@@ -934,4 +936,6 @@ VLPROFEXEC MTASK_BEGIN 20650 id 9 predictStart 167 cpu 10
 VLPROFEXEC MTASK_END 20720 id 9 predictCost 30
 VLPROFEXEC MTASK_BEGIN 21140 id 11 predictStart 197 cpu 10
 VLPROFEXEC MTASK_END 21245 id 11 predictCost 30
+VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 22000 cpu 10
+VLPROFEXEC THREAD_SCHEDULE_WAIT_END 23000 cpu 10
 VLPROF stat ticks 23415
diff --git a/test_regress/t/t_gantt_io.out b/test_regress/t/t_gantt_io.out
index 3a63ceec1..e27de3a0b 100644
--- a/test_regress/t/t_gantt_io.out
+++ b/test_regress/t/t_gantt_io.out
@@ -7,6 +7,7 @@ Argument settings:
 Summary:
  Total elapsed time = 23415 rdtsc ticks
  Parallelized code = 82.51% of elapsed time
+ Waiting time = 8.54% of elapsed time
  Total threads = 2
  Total CPUs used = 2
  Total mtasks = 7
diff --git a/test_regress/t/t_gantt_io.vcd.out b/test_regress/t/t_gantt_io.vcd.out
index c95e55a4b..c41cd86e7 100644
--- a/test_regress/t/t_gantt_io.vcd.out
+++ b/test_regress/t/t_gantt_io.vcd.out
@@ -5,18 +5,20 @@ $timescale 1ns $end
  $scope module measured $end
   $var wire 32 v5 cpu10_mtask [31:0] $end
   $var wire 32 v4 cpu10_thread [31:0] $end
+  $var wire 32 v7 cpu10_waiting [31:0] $end
   $var wire 32 v2 cpu19_mtask [31:0] $end
   $var wire 32 v1 cpu19_thread [31:0] $end
+  $var wire 32 v6 cpu19_waiting [31:0] $end
   $var wire 32 v0 t0_mtask [31:0] $end
   $var wire 32 v3 t1_mtask [31:0] $end
  $upscope $end
  $scope module predicted $end
-  $var wire 32 v6 t0_mtask [31:0] $end
-  $var wire 32 v7 t1_mtask [31:0] $end
+  $var wire 32 v8 t0_mtask [31:0] $end
+  $var wire 32 v9 t1_mtask [31:0] $end
  $upscope $end
  $scope module stats $end
-  $var wire 32 v8 measured_parallelism [31:0] $end
-  $var wire 32 v9 predicted_parallelism [31:0] $end
+  $var wire 32 va measured_parallelism [31:0] $end
+  $var wire 32 vb predicted_parallelism [31:0] $end
  $upscope $end
 $upscope $end
 $enddefinitions $end
@@ -30,193 +32,203 @@ bz v4
 bz v5
 bz v6
 bz v7
-b0 v8
-b0 v9
+bz v8
+bz v9
+b0 va
+b0 vb
 #2695
 b110 v0
 b0 v1
 b110 v2
-b110 v6
-b101 v7
-b1 v8
-b10 v9
+b110 v8
+b101 v9
+b1 va
+b10 vb
 #2905
 bz v0
 bz v1
 bz v2
-b0 v8
+b0 va
 #3800
-bz v6
-b111 v7
-b1 v9
+bz v8
+b111 v9
+b1 vb
 #4906
-b1000 v7
-b1 v9
+b1000 v9
+b1 vb
 #5495
 b101 v3
 b1 v4
 b101 v5
-b1 v8
+b1 va
 #6090
 bz v3
 bz v4
 bz v5
-b0 v8
+b0 va
 #6300
 b111 v3
 b1 v4
 b111 v5
-b1 v8
+b1 va
 #6895
 bz v3
 bz v4
 bz v5
-b0 v8
+b0 va
 #7490
 b1000 v3
 b1 v4
 b1000 v5
-b1 v8
+b1 va
 #8540
 bz v3
 bz v4
 bz v5
-b0 v8
+b0 va
 #8848
-b1001 v7
-b1 v9
+b1001 v9
+b1 vb
 #9135
 b1001 v3
 b1 v4
 b1001 v5
-b1 v8
+b1 va
 #9695
 b1010 v0
 b0 v1
 b1010 v2
-b10 v8
+b10 va
 #9730
 bz v3
 bz v4
 bz v5
-b1 v8
+b1 va
 #9870
 bz v0
 bz v1
 bz v2
-b0 v8
+b0 va
 #9917
-b1010 v6
-b10 v9
+b1010 v8
+b10 vb
 #9954
-b1011 v7
-b10 v9
+b1011 v9
+b10 vb
 #10255
 b1011 v3
 b1 v4
 b1011 v5
-b1 v8
+b1 va
 #11023
-bz v6
-b1 v9
+bz v8
+b1 vb
 #11060
 bz v3
 bz v4
 bz v5
-bz v7
-b0 v8
-b0 v9
+bz v9
+b0 va
+b0 vb
 #15610
 b110 v0
 b0 v1
 b110 v2
-b110 v6
-b101 v7
-b1 v8
-b10 v9
+b110 v8
+b101 v9
+b1 va
+b10 vb
 #15820
 bz v0
 bz v1
 bz v2
-b0 v8
+b0 va
 #16437
-bz v6
-b111 v7
-b1 v9
+bz v8
+b111 v9
+b1 vb
 #17265
-b1000 v7
-b1 v9
+b1000 v9
+b1 vb
 #18375
 b101 v3
 b1 v4
 b101 v5
-b1 v8
+b1 va
 #18970
 bz v3
 bz v4
 bz v5
-b0 v8
+b0 va
 #19145
 b111 v3
 b1 v4
 b111 v5
-b1 v8
+b1 va
 #19320
 bz v3
 bz v4
 bz v5
-b0 v8
+b0 va
 #19670
 b1000 v3
 b1 v4
 b1000 v5
-b1 v8
+b1 va
 #19810
 bz v3
 bz v4
 bz v5
-b0 v8
+b0 va
+#20000
+bx v6
 #20219
-b1001 v7
-b1 v9
+b1001 v9
+b1 vb
 #20650
 b1001 v3
 b1 v4
 b1001 v5
-b1 v8
+b1 va
 #20720
 bz v3
 bz v4
 bz v5
-b0 v8
+b0 va
+#21000
+bz v6
 #21019
-b1010 v6
-b10 v9
+b1010 v8
+b10 vb
 #21047
-b1011 v7
-b10 v9
+b1011 v9
+b10 vb
 #21140
 b1011 v3
 b1 v4
 b1011 v5
-b1 v8
+b1 va
 #21245
 bz v3
 bz v4
 bz v5
-b0 v8
+b0 va
 #21700
 b1010 v0
 b0 v1
 b1010 v2
-b1 v8
+b1 va
 #21847
-bz v6
-b1 v9
+bz v8
+b1 vb
 #21875
 bz v0
 bz v1
 bz v2
+bz v9
+b0 va
+b0 vb
+#22000
+bx v7
+#23000
 bz v7
-b0 v8
-b0 v9
diff --git a/test_regress/t/t_gantt_io_arm.dat b/test_regress/t/t_gantt_io_arm.dat
index 4fd34cfc9..819b7c40d 100644
--- a/test_regress/t/t_gantt_io_arm.dat
+++ b/test_regress/t/t_gantt_io_arm.dat
@@ -50,6 +50,8 @@ VLPROFEXEC MTASK_BEGIN 90465 id 85 predictStart 14315 cpu 2
 VLPROFEXEC MTASK_END 155034 id 85 predictCost 30533
 VLPROFEXEC MTASK_BEGIN 156555 id 79 predictStart 44848 cpu 2
 VLPROFEXEC MTASK_END 294309 id 79 predictCost 48001
+VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 295000 cpu 2
+VLPROFEXEC THREAD_SCHEDULE_WAIT_END 296000 cpu 2
 VLPROFEXEC EXEC_GRAPH_END 300000
 VLPROFTHREAD 1
 VLPROFEXEC MTASK_BEGIN 77352 id 90 predictStart 14315 cpu 3
@@ -58,4 +60,6 @@ VLPROFEXEC MTASK_BEGIN 79799 id 81 predictStart 35907 cpu 3
 VLPROFEXEC MTASK_END 80667 id 81 predictCost 29215
 VLPROFEXEC MTASK_BEGIN 81746 id 87 predictStart 65147 cpu 3
 VLPROFEXEC MTASK_END 82633 id 87 predictCost 33809
+VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 83000 cpu 3
+VLPROFEXEC THREAD_SCHEDULE_WAIT_END 84000 cpu 3
 VLPROF stat ticks 300000
diff --git a/test_regress/t/t_gantt_io_arm.out b/test_regress/t/t_gantt_io_arm.out
index 854b5ad77..a89d4f268 100644
--- a/test_regress/t/t_gantt_io_arm.out
+++ b/test_regress/t/t_gantt_io_arm.out
@@ -7,6 +7,7 @@ Argument settings:
 Summary:
  Total elapsed time = 300000 rdtsc ticks
  Parallelized code = 80.49% of elapsed time
+ Waiting time = 0.67% of elapsed time
  Total threads = 2
  Total CPUs used = 2
  Total mtasks = 5
diff --git a/test_regress/t/t_gantt_io_noproc.dat b/test_regress/t/t_gantt_io_noproc.dat
index 02d2b3442..7fd00bf40 100644
--- a/test_regress/t/t_gantt_io_noproc.dat
+++ b/test_regress/t/t_gantt_io_noproc.dat
@@ -13,6 +13,8 @@ VLPROFEXEC EXEC_GRAPH_END 12180
 VLPROFEXEC EXEC_GRAPH_BEGIN 14000
 VLPROFEXEC MTASK_BEGIN 15610 id 6 predictStart 0 cpu 19
 VLPROFEXEC MTASK_END 15820 id 6 predictCost 30
+VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 16000 cpu 19
+VLPROFEXEC THREAD_SCHEDULE_WAIT_END 17000 cpu 19
 VLPROFEXEC MTASK_BEGIN 21700 id 10 predictStart 196 cpu 19
 VLPROFEXEC MTASK_END 21875 id 10 predictCost 30
 VLPROFEXEC EXEC_GRAPH_END 22085
@@ -27,6 +29,8 @@ VLPROFEXEC MTASK_BEGIN 9135 id 9 predictStart 167 cpu 10
 VLPROFEXEC MTASK_END 9730 id 9 predictCost 30
 VLPROFEXEC MTASK_BEGIN 10255 id 11 predictStart 197 cpu 10
 VLPROFEXEC MTASK_END 11060 id 11 predictCost 30
+VLPROFEXEC THREAD_SCHEDULE_WAIT_BEGIN 17000 cpu 10
+VLPROFEXEC THREAD_SCHEDULE_WAIT_END 18000 cpu 10
 VLPROFEXEC MTASK_BEGIN 18375 id 5 predictStart 0 cpu 10
 VLPROFEXEC MTASK_END 18970 id 5 predictCost 30
 VLPROFEXEC MTASK_BEGIN 19145 id 7 predictStart 30 cpu 10
diff --git a/test_regress/t/t_gantt_io_noproc.out b/test_regress/t/t_gantt_io_noproc.out
index 86e4241f6..2de29d6c3 100644
--- a/test_regress/t/t_gantt_io_noproc.out
+++ b/test_regress/t/t_gantt_io_noproc.out
@@ -7,6 +7,7 @@ Argument settings:
 Summary:
  Total elapsed time = 23415 rdtsc ticks
  Parallelized code = 82.51% of elapsed time
+ Waiting time = 8.54% of elapsed time
  Total threads = 2
  Total CPUs used = 2
  Total mtasks = 7