Ignore dependencies from different hierarchical schedules (#5954)

Signed-off-by: Bartłomiej Chmiel <bchmiel@antmicro.com>
This commit is contained in:
Bartłomiej Chmiel 2025-05-08 12:45:10 +02:00 committed by GitHub
parent a80aa07de6
commit 5f4646f617
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 133 additions and 36 deletions

View File

@ -66,6 +66,7 @@ class ThreadSchedule final {
uint32_t m_id; // Unique ID of a schedule uint32_t m_id; // Unique ID of a schedule
static uint32_t s_nextId; // Next ID number to use static uint32_t s_nextId; // Next ID number to use
std::unordered_set<const ExecMTask*> mtasks; // Mtasks in this schedule std::unordered_set<const ExecMTask*> mtasks; // Mtasks in this schedule
uint32_t m_endTime = 0; // Latest task end time in this schedule
public: public:
// CONSTANTS // CONSTANTS
@ -196,6 +197,7 @@ public:
uint32_t scheduleOn(const ExecMTask* mtaskp, uint32_t bestThreadId) { uint32_t scheduleOn(const ExecMTask* mtaskp, uint32_t bestThreadId) {
mtasks.emplace(mtaskp); mtasks.emplace(mtaskp);
const uint32_t bestEndTime = mtaskp->predictStart() + mtaskp->cost(); const uint32_t bestEndTime = mtaskp->predictStart() + mtaskp->cost();
m_endTime = std::max(m_endTime, bestEndTime);
mtaskState[mtaskp].completionTime = bestEndTime; mtaskState[mtaskp].completionTime = bestEndTime;
mtaskState[mtaskp].threadId = bestThreadId; mtaskState[mtaskp].threadId = bestThreadId;
@ -208,6 +210,7 @@ public:
return bestEndTime; return bestEndTime;
} }
bool contains(const ExecMTask* mtaskp) const { return mtasks.count(mtaskp); } bool contains(const ExecMTask* mtaskp) const { return mtasks.count(mtaskp); }
// Return m_endTime, the latest task end time recorded for this schedule
// (0 until a task end time has been recorded).
uint32_t endTime() const { return m_endTime; }
}; };
uint32_t ThreadSchedule::s_nextId = 0; uint32_t ThreadSchedule::s_nextId = 0;
@ -256,6 +259,8 @@ class PackThreads final {
// METHODS // METHODS
uint32_t completionTime(const ThreadSchedule& schedule, const ExecMTask* mtaskp, uint32_t completionTime(const ThreadSchedule& schedule, const ExecMTask* mtaskp,
uint32_t threadId) { uint32_t threadId) {
// Ignore tasks that were scheduled on a different schedule
if (!schedule.contains(mtaskp)) return 0;
const ThreadSchedule::MTaskState& state = schedule.mtaskState.at(mtaskp); const ThreadSchedule::MTaskState& state = schedule.mtaskState.at(mtaskp);
UASSERT(state.threadId != ThreadSchedule::UNASSIGNED, "Mtask should have assigned thread"); UASSERT(state.threadId != ThreadSchedule::UNASSIGNED, "Mtask should have assigned thread");
if (threadId == state.threadId) { if (threadId == state.threadId) {
@ -373,19 +378,24 @@ class PackThreads final {
} }
} }
const uint32_t endTime = schedule.endTime();
if (!bestMtaskp && mode == SchedulingMode::WIDE_TASK_DISCOVERED) { if (!bestMtaskp && mode == SchedulingMode::WIDE_TASK_DISCOVERED) {
mode = SchedulingMode::WIDE_TASK_SCHEDULING; mode = SchedulingMode::WIDE_TASK_SCHEDULING;
const uint32_t size = m_nThreads / maxThreadWorkers; const uint32_t size = m_nThreads / maxThreadWorkers;
UASSERT(size, "Thread pool size should be bigger than 0"); UASSERT(size, "Thread pool size should be bigger than 0");
// If no tasks were added to the normal thread schedule, remove it. // If no tasks were added to the normal thread schedule, clear it.
if (schedule.mtaskState.empty()) result.erase(result.begin()); if (schedule.mtaskState.empty()) result.clear();
result.emplace_back(ThreadSchedule{size}); result.emplace_back(ThreadSchedule{size});
std::fill(busyUntil.begin(), busyUntil.end(), endTime);
continue; continue;
} }
if (!bestMtaskp && mode == SchedulingMode::WIDE_TASK_SCHEDULING) { if (!bestMtaskp && mode == SchedulingMode::WIDE_TASK_SCHEDULING) {
mode = SchedulingMode::SCHEDULING; mode = SchedulingMode::SCHEDULING;
if (!schedule.mtaskState.empty()) result.emplace_back(ThreadSchedule{m_nThreads}); UASSERT(!schedule.mtaskState.empty(), "Mtask should be added");
result.emplace_back(ThreadSchedule{m_nThreads});
std::fill(busyUntil.begin(), busyUntil.end(), endTime);
continue; continue;
} }
@ -393,24 +403,7 @@ class PackThreads final {
bestMtaskp->predictStart(bestTime); bestMtaskp->predictStart(bestTime);
const uint32_t bestEndTime = schedule.scheduleOn(bestMtaskp, bestThreadId); const uint32_t bestEndTime = schedule.scheduleOn(bestMtaskp, bestThreadId);
// Populate busyUntil timestamps. For multi-worker tasks, set timestamps for
// offsetted threads.
if (mode != SchedulingMode::WIDE_TASK_SCHEDULING) {
busyUntil[bestThreadId] = bestEndTime; busyUntil[bestThreadId] = bestEndTime;
} else {
for (int i = 0; i < maxThreadWorkers; ++i) {
const size_t threadId = bestThreadId + (i * schedule.threads.size());
UASSERT(threadId < busyUntil.size(),
"Incorrect busyUntil offset: threadId=" + cvtToStr(threadId)
+ " bestThreadId=" + cvtToStr(bestThreadId) + " i=" + cvtToStr(i)
+ " schedule-size=" + cvtToStr(schedule.threads.size())
+ " maxThreadWorkers=" + cvtToStr(maxThreadWorkers));
busyUntil[threadId] = bestEndTime;
UINFO(6, "Will schedule " << bestMtaskp->name() << " onto thread " << threadId
<< endl);
}
}
// Update the ready list // Update the ready list
const size_t erased = readyMTasks.erase(bestMtaskp); const size_t erased = readyMTasks.erase(bestMtaskp);
@ -439,6 +432,10 @@ class PackThreads final {
public: public:
// SELF TEST // SELF TEST
// Entry point for the packer self tests: runs selfTestHierFirst() and then
// selfTestNormalFirst(), in that order.
static void selfTest() {
selfTestHierFirst();
selfTestNormalFirst();
}
static void selfTestNormalFirst() {
V3Graph graph; V3Graph graph;
FileLine* const flp = v3Global.rootp()->fileline(); FileLine* const flp = v3Global.rootp()->fileline();
std::vector<AstMTaskBody*> mTaskBodyps; std::vector<AstMTaskBody*> mTaskBodyps;
@ -466,6 +463,12 @@ public:
t4->cost(100); t4->cost(100);
t4->priority(100); t4->priority(100);
t4->threads(3); t4->threads(3);
ExecMTask* const t5 = new ExecMTask{&graph, makeBody()};
t5->cost(100);
t5->priority(100);
ExecMTask* const t6 = new ExecMTask{&graph, makeBody()};
t6->cost(100);
t6->priority(100);
/* /*
0 0
@ -473,11 +476,15 @@ public:
1 2 1 2
/ \ / \
3 4 3 4
/ \
5 6
*/ */
new V3GraphEdge{&graph, t0, t1, 1}; new V3GraphEdge{&graph, t0, t1, 1};
new V3GraphEdge{&graph, t0, t2, 1}; new V3GraphEdge{&graph, t0, t2, 1};
new V3GraphEdge{&graph, t2, t3, 1}; new V3GraphEdge{&graph, t2, t3, 1};
new V3GraphEdge{&graph, t2, t4, 1}; new V3GraphEdge{&graph, t2, t4, 1};
new V3GraphEdge{&graph, t3, t5, 1};
new V3GraphEdge{&graph, t4, t6, 1};
constexpr uint32_t threads = 6; constexpr uint32_t threads = 6;
PackThreads packer{threads, PackThreads packer{threads,
@ -485,6 +492,7 @@ public:
10}; // Sandbag denom 10}; // Sandbag denom
const std::vector<ThreadSchedule> scheduled = packer.pack(graph); const std::vector<ThreadSchedule> scheduled = packer.pack(graph);
UASSERT_SELFTEST(size_t, scheduled.size(), 3);
UASSERT_SELFTEST(size_t, scheduled[0].threads.size(), threads); UASSERT_SELFTEST(size_t, scheduled[0].threads.size(), threads);
UASSERT_SELFTEST(size_t, scheduled[0].threads[0].size(), 2); UASSERT_SELFTEST(size_t, scheduled[0].threads[0].size(), 2);
for (size_t i = 1; i < scheduled[0].threads.size(); ++i) for (size_t i = 1; i < scheduled[0].threads.size(); ++i)
@ -494,17 +502,23 @@ public:
UASSERT_SELFTEST(const ExecMTask*, scheduled[0].threads[0][1], t1); UASSERT_SELFTEST(const ExecMTask*, scheduled[0].threads[0][1], t1);
UASSERT_SELFTEST(size_t, scheduled[1].threads.size(), threads / 3); UASSERT_SELFTEST(size_t, scheduled[1].threads.size(), threads / 3);
UASSERT_SELFTEST(const ExecMTask*, scheduled[1].threads[1][0], t2); UASSERT_SELFTEST(const ExecMTask*, scheduled[1].threads[0][0], t2);
UASSERT_SELFTEST(const ExecMTask*, scheduled[1].threads[1][1], t3); UASSERT_SELFTEST(const ExecMTask*, scheduled[1].threads[0][1], t3);
UASSERT_SELFTEST(const ExecMTask*, scheduled[1].threads[0][0], t4); UASSERT_SELFTEST(const ExecMTask*, scheduled[1].threads[1][0], t4);
UASSERT_SELFTEST(size_t, ThreadSchedule::mtaskState.size(), 5); UASSERT_SELFTEST(size_t, scheduled[2].threads.size(), threads);
UASSERT_SELFTEST(const ExecMTask*, scheduled[2].threads[0][0], t5);
UASSERT_SELFTEST(const ExecMTask*, scheduled[2].threads[1][0], t6);
UASSERT_SELFTEST(size_t, ThreadSchedule::mtaskState.size(), 7);
UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t0), 0); UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t0), 0);
UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t1), 0); UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t1), 0);
UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t2), 1); UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t2), 0);
UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t3), 1); UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t3), 0);
UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t4), 0); UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t4), 1);
UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t5), 0);
UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t6), 1);
// On its native thread, we see the actual end time for t0: // On its native thread, we see the actual end time for t0:
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[0], t0, 0), 1000); UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[0], t0, 0), 1000);
@ -518,14 +532,97 @@ public:
// with t0's sandbagged time; compounding caused trouble in // with t0's sandbagged time; compounding caused trouble in
// practice. // practice.
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[0], t1, 1), 1130); UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[0], t1, 1), 1130);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t2, 0), 1229);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t2, 1), 1199); // Wide task scheduling
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t3, 0), 1329);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t3, 1), 1299); // Task does not depend on previous or future schedules
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t4, 0), 1329); UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[0], t2, 0), 0);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t4, 1), 1359); UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[2], t2, 0), 0);
// We allow sandbagging for hierarchical children tasks, this does not affect
// wide task scheduling. When the next schedule is created it doesn't matter
// anyway.
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t2, 0), 1200);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t2, 1), 1230);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t2, 2), 1230);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t2, 3), 1230);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t2, 4), 1230);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t2, 5), 1230);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t3, 0), 1300);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t3, 1), 1330);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t3, 2), 1330);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t3, 3), 1330);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t3, 4), 1330);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t3, 5), 1330);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t4, 0), 1360);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t4, 1), 1330);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t4, 2), 1360);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t4, 3), 1360);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t4, 4), 1360);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t4, 5), 1360);
for (AstNode* const nodep : mTaskBodyps) nodep->deleteTree(); for (AstNode* const nodep : mTaskBodyps) nodep->deleteTree();
ThreadSchedule::mtaskState.clear();
}
// Self test: pack a two-task chain in which the first task requests two
// threads (t0->threads(2)) and the second is a plain single-thread task.
// NOTE(review): presumably threads(2) marks t0 as a "wide"/hierarchical task
// that must be placed in its own schedule - confirm against ExecMTask.
// Expects two schedules (t0 alone, then t1 alone) and checks the completion
// times reported for each task.
static void selfTestHierFirst() {
V3Graph graph;
FileLine* const flp = v3Global.rootp()->fileline();
// Bodies are tracked here so they can be deleted at the end of the test
std::vector<AstMTaskBody*> mTaskBodyps;
// Create a minimal mtask body (a single empty comment statement) and
// remember it for cleanup
const auto makeBody = [&]() {
AstMTaskBody* const bodyp = new AstMTaskBody{flp};
mTaskBodyps.push_back(bodyp);
bodyp->addStmtsp(new AstComment{flp, ""});
return bodyp;
};
// t0: expensive, high priority, requests 2 threads
ExecMTask* const t0 = new ExecMTask{&graph, makeBody()};
t0->cost(1000);
t0->priority(1100);
t0->threads(2);
// t1: cheap task that depends on t0
ExecMTask* const t1 = new ExecMTask{&graph, makeBody()};
t1->cost(100);
t1->priority(100);
/*
0
|
1
*/
new V3GraphEdge{&graph, t0, t1, 1};
constexpr uint32_t threads = 2;
PackThreads packer{threads,
3, // Sandbag numerator
10}; // Sandbag denom
const std::vector<ThreadSchedule> scheduled = packer.pack(graph);
// Two schedules are expected: one holding t0, one holding t1
UASSERT_SELFTEST(size_t, scheduled.size(), 2);
// First schedule: threads / 2 == 1 thread, containing only t0
UASSERT_SELFTEST(size_t, scheduled[0].threads.size(), threads / 2);
UASSERT_SELFTEST(size_t, scheduled[0].threads[0].size(), 1);
// With the single thread asserted above this loop has nothing to visit;
// it checks that any further threads are empty should the size change
for (size_t i = 1; i < scheduled[0].threads.size(); ++i)
UASSERT_SELFTEST(size_t, scheduled[0].threads[i].size(), 0);
UASSERT_SELFTEST(const ExecMTask*, scheduled[0].threads[0][0], t0);
// Second schedule: all `threads` threads, t1 on thread 0, the rest empty
UASSERT_SELFTEST(size_t, scheduled[1].threads.size(), threads);
UASSERT_SELFTEST(size_t, scheduled[1].threads[0].size(), 1);
for (size_t i = 1; i < scheduled[1].threads.size(); ++i)
UASSERT_SELFTEST(size_t, scheduled[1].threads[i].size(), 0);
UASSERT_SELFTEST(const ExecMTask*, scheduled[1].threads[0][0], t1);
// Exactly the two tasks have recorded state, both assigned thread id 0
UASSERT_SELFTEST(size_t, ThreadSchedule::mtaskState.size(), 2);
UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t0), 0);
UASSERT_SELFTEST(uint32_t, ThreadSchedule::threadId(t1), 0);
// t0 completes at its cost (1000). t1 completes at 1100 as seen from its own
// thread, i.e. t0's end plus t1's cost, and at 1130 as seen from thread 1.
// NOTE(review): the extra 30 matches 3/10 of t1's cost (the sandbag ratio
// passed to the packer above) - confirm against completionTime().
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[0], t0, 0), 1000);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t1, 0), 1100);
UASSERT_SELFTEST(uint32_t, packer.completionTime(scheduled[1], t1, 1), 1130);
// Cleanup: delete the bodies created above and clear the static
// ThreadSchedule::mtaskState map
for (AstNode* const nodep : mTaskBodyps) nodep->deleteTree();
ThreadSchedule::mtaskState.clear();
}
static std::vector<ThreadSchedule> apply(V3Graph& mtaskGraph) { static std::vector<ThreadSchedule> apply(V3Graph& mtaskGraph) {

View File

@ -36,7 +36,7 @@ if test.vltmt:
test.file_grep(test.obj_dir + "/V" + test.name + "__hier.dir/V" + test.name + "__stats.txt", test.file_grep(test.obj_dir + "/V" + test.name + "__hier.dir/V" + test.name + "__stats.txt",
r'Optimizations, Thread schedule count\s+(\d+)', 4) r'Optimizations, Thread schedule count\s+(\d+)', 4)
test.file_grep(test.obj_dir + "/V" + test.name + "__hier.dir/V" + test.name + "__stats.txt", test.file_grep(test.obj_dir + "/V" + test.name + "__hier.dir/V" + test.name + "__stats.txt",
r'Optimizations, Thread schedule total tasks\s+(\d+)', 10) r'Optimizations, Thread schedule total tasks\s+(\d+)', 12)
test.execute(all_run_flags=[ test.execute(all_run_flags=[
"+verilator+prof+exec+start+2", "+verilator+prof+exec+start+2",