Fix TSP variable ordering for mtasks (#5342) (#7610)

Fixes #5342
This commit is contained in:
Muzaffer Kal 2026-05-30 12:35:12 -07:00 committed by GitHub
parent 2ccaae77ae
commit 57fa98e52c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 98 additions and 65 deletions

View File

@ -937,13 +937,14 @@ macro-task's dataset fits in one core's local caches.
To achieve spatial locality, we tag each variable with the set of
macro-tasks that access it. Let's call this set the "footprint" of that
variable. The variables in a given module have a set of footprints. We can
order those footprints to minimize the distance between them (distance is
the number of macro-tasks that are different across any two footprints) and
then emit all variables into the struct in ordered-footprint order.
variable. The variables in a given module have a set of footprints. We group
variables with identical non-empty footprints, emit those groups in deterministic
footprint-key order, then emit variables with no footprint information last.
The footprint ordering is literally the traveling salesman problem, and we
use a TSP-approximation algorithm to get close to an optimal sort.
The first emitted variable in each footprint group is aligned to a cache-line
boundary. This avoids false sharing between different macro-task footprints
without building a complete pairwise-distance graph over all footprints, which
would use excessive memory on very large models.
This is an old idea. Simulators designed at DEC in the early 1990s used
similar techniques to optimize both single-thread and multithread modes.

View File

@ -2008,6 +2008,7 @@ class AstVar final : public AstNode {
bool m_globalConstrained : 1; // Global constraint per IEEE 1800-2023 18.5.8
bool m_isStdRandomizeArg : 1; // Argument variable created for std::randomize (__Varg*)
bool m_processQueue : 1; // Process queue variable
bool m_mtaskCacheLineAlign : 1; // Start MTask affinity group on a cache line
void init() {
m_ansi = false;
m_declTyped = false;
@ -2070,6 +2071,7 @@ class AstVar final : public AstNode {
m_globalConstrained = false;
m_isStdRandomizeArg = false;
m_processQueue = false;
m_mtaskCacheLineAlign = false;
}
public:
@ -2318,6 +2320,8 @@ public:
}
bool isUsedParam() const { return m_usedParam; }
bool isUsedLoopIdx() const { return m_usedLoopIdx; }
// Get/set the flag marking this variable as the start of an MTask affinity group
// that should begin on a cache line
bool mtaskCacheLineAlign() const { return m_mtaskCacheLineAlign; }
void mtaskCacheLineAlign(bool flag) { m_mtaskCacheLineAlign = flag; }
bool isSc() const VL_MT_SAFE { return m_sc; }
bool isScQuad() const;
bool isScBv() const VL_MT_STABLE;

View File

@ -180,6 +180,7 @@ void EmitCBaseVisitorConst::emitCFuncDecl(const AstCFunc* funcp, const AstNodeMo
void EmitCBaseVisitorConst::emitVarDecl(const AstVar* nodep, bool asRef) {
const AstBasicDType* const basicp = nodep->basicp();
const bool refNeedParens = VN_IS(nodep->dtypeSkipRefp(), UnpackArrayDType);
if (nodep->mtaskCacheLineAlign() && !asRef) putns(nodep, "alignas(VL_CACHE_LINE_BYTES) ");
const auto emitDeclArrayBrackets = [this](const AstVar* nodep) -> void {
// This isn't very robust and may need cleanup for other data types

View File

@ -27,9 +27,9 @@
#include "V3AstUserAllocator.h"
#include "V3EmitCBase.h"
#include "V3ExecGraph.h"
#include "V3TSP.h"
#include "V3ThreadPool.h"
#include <algorithm>
#include <vector>
VL_DEFINE_DEBUG_FUNCTIONS;
@ -90,41 +90,6 @@ public:
}
};
//######################################################################
// Establish mtask variable sort order in mtasks mode
class VarTspSorter final : public V3TSP::TspStateBase {
// MEMBERS
const MTaskIdVec& m_mTaskIds; // Mtask we're ordering
static uint32_t s_serialNext; // Unique ID to establish serial order
const uint32_t m_serial = ++s_serialNext; // Serial ordering
public:
// CONSTRUCTORS
explicit VarTspSorter(const MTaskIdVec& mTaskIds)
: m_mTaskIds{mTaskIds} {
UASSERT(mTaskIds.size() == ExecMTask::numUsedIds(), "Wrong size for MTask ID vector");
}
~VarTspSorter() override = default;
// METHODS
bool operator<(const TspStateBase& other) const override {
return operator<(static_cast<const VarTspSorter&>(other));
}
bool operator<(const VarTspSorter& other) const { return m_serial < other.m_serial; }
const MTaskIdVec& mTaskIds() const { return m_mTaskIds; }
int cost(const TspStateBase* otherp) const override VL_MT_SAFE {
return cost(static_cast<const VarTspSorter*>(otherp));
}
int cost(const VarTspSorter* otherp) const VL_MT_SAFE {
// Compute the number of MTasks not shared (Hamming distance)
int cost = 0;
const size_t size = ExecMTask::numUsedIds();
for (size_t i = 0; i < size; ++i) cost += m_mTaskIds.at(i) ^ otherp->m_mTaskIds.at(i);
return cost;
}
};
uint32_t VarTspSorter::s_serialNext = 0;
struct VarAttributes final {
uint8_t stratum; // Roughly equivalent to alignment requirement, to avoid padding
bool anonOk; // Can be emitted as part of anonymous structure
@ -165,10 +130,14 @@ class VariableOrder final {
});
}
static bool emptyAffinity(const MTaskIdVec& vec) {
return std::find(vec.begin(), vec.end(), true) == vec.end();
}
// Sort by MTask-affinity first, then the same as simpleSortVars
void tspSortVars(std::vector<AstVar*>& varps) {
void mtaskSortVars(std::vector<AstVar*>& varps) {
// Map from "MTask affinity" -> "variable list"
std::map<const MTaskIdVec, std::vector<AstVar*>> m2v;
std::map<MTaskIdVec, std::vector<AstVar*>> m2v;
const MTaskIdVec emptyVec(ExecMTask::numUsedIds(), false);
for (AstVar* const varp : varps) {
const auto it = m_mTaskAffinity.find(varp);
@ -176,35 +145,38 @@ class VariableOrder final {
m2v[key].push_back(varp);
}
// Create a TSP sort state for each unique MTaskIdSet, except for the empty set
V3TSP::StateVec states;
for (const auto& pair : m2v) {
const MTaskIdVec& vec = pair.first;
const bool empty = std::find(vec.begin(), vec.end(), true) == vec.end();
if (!empty) states.push_back(new VarTspSorter{vec});
}
// Do the TSP sort
V3TSP::StateVec sortedStates;
V3TSP::tspSort(states, &sortedStates);
varps.clear();
// Helper function to sort given vector, then append to 'varps'
const auto sortAndAppend = [this, &varps](std::vector<AstVar*>& subVarps) {
const auto sortAndAppend = [this, &varps](std::vector<AstVar*>& subVarps,
bool alignFirst) {
simpleSortVars(subVarps);
for (AstVar* const varp : subVarps) varps.push_back(varp);
bool aligned = !alignFirst;
for (AstVar* const varp : subVarps) {
if (!aligned && !varp->isStatic()) {
varp->mtaskCacheLineAlign(true);
V3Stats::addStatSum("VariableOrder, MTask aligned group starts", 1);
aligned = true;
}
varps.push_back(varp);
}
};
// Enumerate by sorted MTaskIdSet, sort within the set separately
for (const V3TSP::TspStateBase* const stateBasep : sortedStates) {
const VarTspSorter* const statep = dynamic_cast<const VarTspSorter*>(stateBasep);
sortAndAppend(m2v[statep->mTaskIds()]);
VL_DO_DANGLING(delete statep, statep);
// Sort non-empty MTask affinity groups in the map's deterministic key order. This keeps
// memory linear in the number of affinity groups, unlike the old complete pairwise-distance
// ordering.
size_t affinityGroups = 0;
for (auto& pair : m2v) {
if (emptyAffinity(pair.first)) continue;
sortAndAppend(pair.second, true);
++affinityGroups;
}
// Finally add the variables with no known MTask affinity
sortAndAppend(m2v[emptyVec]);
sortAndAppend(m2v[emptyVec], false);
V3Stats::addStatSum("VariableOrder, MTask affinity groups", affinityGroups);
V3Stats::addStatSum("VariableOrder, no-affinity variables", m2v[emptyVec].size());
}
// cppcheck-suppress constParameterPointer
@ -236,7 +208,7 @@ class VariableOrder final {
if (!v3Global.opt.mtasks()) {
simpleSortVars(m_varps);
} else {
tspSortVars(m_varps);
mtaskSortVars(m_varps);
}
}
}

View File

@ -0,0 +1,55 @@
#!/usr/bin/env python3
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of either the GNU Lesser General Public License Version 3
# or the Perl Artistic License Version 2.0.
# SPDX-FileCopyrightText: 2026 Wilson Snyder
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
import vltest_bootstrap
test.scenarios('vlt_all')
# The top-level Verilog file is placed in obj_dir; it is written by the gen() call
# further down in this script before compiling.
test.top_filename = test.obj_dir + "/t_variable_order_mtask.v"
def gen(filename, nregs):
    """Write the generated Verilog test module to `filename`.

    The module declares `nregs` 32-bit registers r0..r<nregs-1>, gives each its
    own clocked always_ff block, and XORs all of them into the output `o` in a
    single always_comb block.
    """
    with open(filename, 'w', encoding="utf8") as fh:
        indices = range(nregs)
        pieces = [
            "// Generated by t_variable_order_mtask.py\n",
            "module t(input logic clk, input logic [7:0] i, output logic [31:0] o);\n",
        ]
        # Register declarations
        pieces.extend(f" logic [31:0] r{idx};\n" for idx in indices)
        # One clocked process per register
        for idx in indices:
            pieces.append(" always_ff @(posedge clk) begin\n")
            pieces.append(f" r{idx} <= (r{idx} + {{24'd0, i}}) ^ 32'h{idx + 1:08x};\n")
            pieces.append(" end\n")
        # Combinational XOR reduction of every register into the output
        xor_terms = "".join(f" ^ r{idx}" for idx in indices)
        pieces.append(" always_comb begin\n")
        pieces.append(" o = 32'h0" + xor_terms + ";\n")
        pieces.append(" end\n")
        pieces.append("endmodule\n")
        fh.write("".join(pieces))
# Generate the design under test with 24 registers
gen(test.top_filename, 24)

# Multithreaded scenarios get two threads and a cap on the number of mtasks
flags = ["--cc", "--stats", "-Wno-UNOPTTHREADS"]
if test.vltmt:
    flags.append("--threads-max-mtasks 8")
    nthreads = 2
else:
    nthreads = 1
test.compile(verilator_flags2=flags, threads=nthreads)

root_h = test.obj_dir + "/" + test.vm_prefix + "___024root.h"
aligned_var_re = r'alignas\(VL_CACHE_LINE_BYTES\) (?:CData|SData|IData|QData|VlWide|VL_)'

# With mtasks, cache-line alignment and the VariableOrder stats must be present;
# otherwise none of them may appear.
if test.vltmt:
    check = test.file_grep
    expectations = [
        (root_h, aligned_var_re),
        (test.stats, r'VariableOrder, MTask affinity groups\s+([1-9]\d*)'),
        (test.stats, r'VariableOrder, MTask aligned group starts\s+([1-9]\d*)'),
    ]
else:
    check = test.file_grep_not
    expectations = [
        (root_h, aligned_var_re),
        (test.stats, r'VariableOrder, MTask affinity groups'),
        (test.stats, r'VariableOrder, MTask aligned group starts'),
    ]
for path, pattern in expectations:
    check(path, pattern)

test.passes()