parent
2ccaae77ae
commit
57fa98e52c
|
|
@ -937,13 +937,14 @@ macro-task's dataset fits in one core's local caches.
|
|||
|
||||
To achieve spatial locality, we tag each variable with the set of
|
||||
macro-tasks that access it. Let's call this set the "footprint" of that
|
||||
variable. The variables in a given module have a set of footprints. We can
|
||||
order those footprints to minimize the distance between them (distance is
|
||||
the number of macro-tasks that are different across any two footprints) and
|
||||
then emit all variables into the struct in ordered-footprint order.
|
||||
variable. The variables in a given module have a set of footprints. We group
|
||||
variables with identical non-empty footprints, emit those groups in deterministic
|
||||
footprint-key order, then emit variables with no footprint information last.
|
||||
|
||||
The footprint ordering is literally the traveling salesman problem, and we
|
||||
use a TSP-approximation algorithm to get close to an optimal sort.
|
||||
The first emitted variable in each footprint group is aligned to a cache-line
|
||||
boundary. This avoids false sharing between different macro-task footprints
|
||||
without building a complete pairwise-distance graph over all footprints, which
|
||||
would use excessive memory on very large models.
|
||||
|
||||
This is an old idea. Simulators designed at DEC in the early 1990s used
|
||||
similar techniques to optimize both single-thread and multithread modes.
|
||||
|
|
|
|||
|
|
@ -2008,6 +2008,7 @@ class AstVar final : public AstNode {
|
|||
bool m_globalConstrained : 1; // Global constraint per IEEE 1800-2023 18.5.8
|
||||
bool m_isStdRandomizeArg : 1; // Argument variable created for std::randomize (__Varg*)
|
||||
bool m_processQueue : 1; // Process queue variable
|
||||
bool m_mtaskCacheLineAlign : 1; // Start MTask affinity group on a cache line
|
||||
void init() {
|
||||
m_ansi = false;
|
||||
m_declTyped = false;
|
||||
|
|
@ -2070,6 +2071,7 @@ class AstVar final : public AstNode {
|
|||
m_globalConstrained = false;
|
||||
m_isStdRandomizeArg = false;
|
||||
m_processQueue = false;
|
||||
m_mtaskCacheLineAlign = false;
|
||||
}
|
||||
|
||||
public:
|
||||
|
|
@ -2318,6 +2320,8 @@ public:
|
|||
}
|
||||
// Return the current value of the m_usedParam flag
bool isUsedParam() const { return m_usedParam; }
|
||||
// Return the current value of the m_usedLoopIdx flag
bool isUsedLoopIdx() const { return m_usedLoopIdx; }
|
||||
// True if this variable starts an MTask affinity group on a cache line
bool mtaskCacheLineAlign() const { return m_mtaskCacheLineAlign; }
|
||||
// Set or clear the flag marking this variable as the cache-line-aligned
// start of an MTask affinity group
void mtaskCacheLineAlign(bool flag) { m_mtaskCacheLineAlign = flag; }
|
||||
// Return the current value of the m_sc flag
bool isSc() const VL_MT_SAFE { return m_sc; }
|
||||
bool isScQuad() const;
|
||||
bool isScBv() const VL_MT_STABLE;
|
||||
|
|
|
|||
|
|
@ -180,6 +180,7 @@ void EmitCBaseVisitorConst::emitCFuncDecl(const AstCFunc* funcp, const AstNodeMo
|
|||
void EmitCBaseVisitorConst::emitVarDecl(const AstVar* nodep, bool asRef) {
|
||||
const AstBasicDType* const basicp = nodep->basicp();
|
||||
const bool refNeedParens = VN_IS(nodep->dtypeSkipRefp(), UnpackArrayDType);
|
||||
if (nodep->mtaskCacheLineAlign() && !asRef) putns(nodep, "alignas(VL_CACHE_LINE_BYTES) ");
|
||||
|
||||
const auto emitDeclArrayBrackets = [this](const AstVar* nodep) -> void {
|
||||
// This isn't very robust and may need cleanup for other data types
|
||||
|
|
|
|||
|
|
@ -27,9 +27,9 @@
|
|||
#include "V3AstUserAllocator.h"
|
||||
#include "V3EmitCBase.h"
|
||||
#include "V3ExecGraph.h"
|
||||
#include "V3TSP.h"
|
||||
#include "V3ThreadPool.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
VL_DEFINE_DEBUG_FUNCTIONS;
|
||||
|
|
@ -90,41 +90,6 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
//######################################################################
|
||||
// Establish mtask variable sort order in mtasks mode
|
||||
|
||||
class VarTspSorter final : public V3TSP::TspStateBase {
|
||||
// MEMBERS
|
||||
const MTaskIdVec& m_mTaskIds; // Mtask we're ordering
|
||||
static uint32_t s_serialNext; // Unique ID to establish serial order
|
||||
const uint32_t m_serial = ++s_serialNext; // Serial ordering
|
||||
public:
|
||||
// CONSTRUCTORS
|
||||
explicit VarTspSorter(const MTaskIdVec& mTaskIds)
|
||||
: m_mTaskIds{mTaskIds} {
|
||||
UASSERT(mTaskIds.size() == ExecMTask::numUsedIds(), "Wrong size for MTask ID vector");
|
||||
}
|
||||
~VarTspSorter() override = default;
|
||||
// METHODS
|
||||
bool operator<(const TspStateBase& other) const override {
|
||||
return operator<(static_cast<const VarTspSorter&>(other));
|
||||
}
|
||||
bool operator<(const VarTspSorter& other) const { return m_serial < other.m_serial; }
|
||||
const MTaskIdVec& mTaskIds() const { return m_mTaskIds; }
|
||||
int cost(const TspStateBase* otherp) const override VL_MT_SAFE {
|
||||
return cost(static_cast<const VarTspSorter*>(otherp));
|
||||
}
|
||||
int cost(const VarTspSorter* otherp) const VL_MT_SAFE {
|
||||
// Compute the number of MTasks not shared (Hamming distance)
|
||||
int cost = 0;
|
||||
const size_t size = ExecMTask::numUsedIds();
|
||||
for (size_t i = 0; i < size; ++i) cost += m_mTaskIds.at(i) ^ otherp->m_mTaskIds.at(i);
|
||||
return cost;
|
||||
}
|
||||
};
|
||||
|
||||
uint32_t VarTspSorter::s_serialNext = 0;
|
||||
|
||||
struct VarAttributes final {
|
||||
uint8_t stratum; // Roughly equivalent to alignment requirement, to avoid padding
|
||||
bool anonOk; // Can be emitted as part of anonymous structure
|
||||
|
|
@ -165,10 +130,14 @@ class VariableOrder final {
|
|||
});
|
||||
}
|
||||
|
||||
static bool emptyAffinity(const MTaskIdVec& vec) {
|
||||
return std::find(vec.begin(), vec.end(), true) == vec.end();
|
||||
}
|
||||
|
||||
// Sort by MTask-affinity first, then the same as simpleSortVars
|
||||
void tspSortVars(std::vector<AstVar*>& varps) {
|
||||
void mtaskSortVars(std::vector<AstVar*>& varps) {
|
||||
// Map from "MTask affinity" -> "variable list"
|
||||
std::map<const MTaskIdVec, std::vector<AstVar*>> m2v;
|
||||
std::map<MTaskIdVec, std::vector<AstVar*>> m2v;
|
||||
const MTaskIdVec emptyVec(ExecMTask::numUsedIds(), false);
|
||||
for (AstVar* const varp : varps) {
|
||||
const auto it = m_mTaskAffinity.find(varp);
|
||||
|
|
@ -176,35 +145,38 @@ class VariableOrder final {
|
|||
m2v[key].push_back(varp);
|
||||
}
|
||||
|
||||
// Create a TSP sort state for each unique MTaskIdSet, except for the empty set
|
||||
V3TSP::StateVec states;
|
||||
for (const auto& pair : m2v) {
|
||||
const MTaskIdVec& vec = pair.first;
|
||||
const bool empty = std::find(vec.begin(), vec.end(), true) == vec.end();
|
||||
if (!empty) states.push_back(new VarTspSorter{vec});
|
||||
}
|
||||
|
||||
// Do the TSP sort
|
||||
V3TSP::StateVec sortedStates;
|
||||
V3TSP::tspSort(states, &sortedStates);
|
||||
|
||||
varps.clear();
|
||||
|
||||
// Helper function to sort given vector, then append to 'varps'
|
||||
const auto sortAndAppend = [this, &varps](std::vector<AstVar*>& subVarps) {
|
||||
const auto sortAndAppend = [this, &varps](std::vector<AstVar*>& subVarps,
|
||||
bool alignFirst) {
|
||||
simpleSortVars(subVarps);
|
||||
for (AstVar* const varp : subVarps) varps.push_back(varp);
|
||||
bool aligned = !alignFirst;
|
||||
for (AstVar* const varp : subVarps) {
|
||||
if (!aligned && !varp->isStatic()) {
|
||||
varp->mtaskCacheLineAlign(true);
|
||||
V3Stats::addStatSum("VariableOrder, MTask aligned group starts", 1);
|
||||
aligned = true;
|
||||
}
|
||||
varps.push_back(varp);
|
||||
}
|
||||
};
|
||||
|
||||
// Enumerate by sorted MTaskIdSet, sort within the set separately
|
||||
for (const V3TSP::TspStateBase* const stateBasep : sortedStates) {
|
||||
const VarTspSorter* const statep = dynamic_cast<const VarTspSorter*>(stateBasep);
|
||||
sortAndAppend(m2v[statep->mTaskIds()]);
|
||||
VL_DO_DANGLING(delete statep, statep);
|
||||
// Sort non-empty MTask affinity groups in the map's deterministic key order. This keeps
|
||||
// memory linear in the number of affinity groups, unlike the old complete pairwise-distance
|
||||
// ordering.
|
||||
size_t affinityGroups = 0;
|
||||
for (auto& pair : m2v) {
|
||||
if (emptyAffinity(pair.first)) continue;
|
||||
sortAndAppend(pair.second, true);
|
||||
++affinityGroups;
|
||||
}
|
||||
|
||||
// Finally add the variables with no known MTask affinity
|
||||
sortAndAppend(m2v[emptyVec]);
|
||||
sortAndAppend(m2v[emptyVec], false);
|
||||
|
||||
V3Stats::addStatSum("VariableOrder, MTask affinity groups", affinityGroups);
|
||||
V3Stats::addStatSum("VariableOrder, no-affinity variables", m2v[emptyVec].size());
|
||||
}
|
||||
|
||||
// cppcheck-suppress constParameterPointer
|
||||
|
|
@ -236,7 +208,7 @@ class VariableOrder final {
|
|||
if (!v3Global.opt.mtasks()) {
|
||||
simpleSortVars(m_varps);
|
||||
} else {
|
||||
tspSortVars(m_varps);
|
||||
mtaskSortVars(m_varps);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,55 @@
|
|||
#!/usr/bin/env python3
|
||||
# DESCRIPTION: Verilator: Verilog Test driver/expect definition
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify it
|
||||
# under the terms of either the GNU Lesser General Public License Version 3
|
||||
# or the Perl Artistic License Version 2.0.
|
||||
# SPDX-FileCopyrightText: 2026 Wilson Snyder
|
||||
# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
|
||||
|
||||
import vltest_bootstrap
|
||||
|
||||
test.scenarios('vlt_all')
# The Verilog top file is generated by this script into the object directory
test.top_filename = f"{test.obj_dir}/t_variable_order_mtask.v"
|
||||
|
||||
|
||||
def gen(filename, nregs):
    """Write a generated Verilog module 't' to 'filename'.

    The module declares 'nregs' 32-bit registers r0..r<nregs-1>, one
    always_ff block per register that updates it on posedge clk, and one
    always_comb block that XORs every register into output 'o'.
    """
    with open(filename, 'w', encoding="utf8") as fh:
        emit = fh.write
        emit("// Generated by t_variable_order_mtask.py\n")
        emit("module t(input logic clk, input logic [7:0] i, output logic [31:0] o);\n")
        # Register declarations
        fh.writelines(f" logic [31:0] r{idx};\n" for idx in range(nregs))
        # One clocked block per register; the 'i' in the text is the Verilog input port
        for idx in range(nregs):
            fh.writelines((
                " always_ff @(posedge clk) begin\n",
                f" r{idx} <= (r{idx} + {{24'd0, i}}) ^ 32'h{idx + 1:08x};\n",
                " end\n",
            ))
        # Combinational XOR reduction of all registers
        emit(" always_comb begin\n")
        emit(" o = 32'h0")
        fh.writelines(f" ^ r{idx}" for idx in range(nregs))
        emit(";\n")
        emit(" end\n")
        emit("endmodule\n")
|
||||
|
||||
|
||||
# Generate the design with 24 registers
gen(test.top_filename, 24)

# The MTask limit option is only passed in the multithreaded scenario
flags = ["--cc", "--stats", "-Wno-UNOPTTHREADS"]
if test.vltmt:
    flags.append("--threads-max-mtasks 8")

test.compile(verilator_flags2=flags, threads=(2 if test.vltmt else 1))

root_h = "".join((test.obj_dir, "/", test.vm_prefix, "___024root.h"))
aligned_var_re = r'alignas\(VL_CACHE_LINE_BYTES\) (?:CData|SData|IData|QData|VlWide|VL_)'
stat_names = (
    r'VariableOrder, MTask affinity groups',
    r'VariableOrder, MTask aligned group starts',
)

if test.vltmt:
    # Multithreaded: expect an aligned declaration and non-zero statistics
    test.file_grep(root_h, aligned_var_re)
    for stat_name in stat_names:
        test.file_grep(test.stats, stat_name + r'\s+([1-9]\d*)')
else:
    # Otherwise: expect no aligned declaration and no such statistics
    test.file_grep_not(root_h, aligned_var_re)
    for stat_name in stat_names:
        test.file_grep_not(test.stats, stat_name)

test.passes()
|
||||
Loading…
Reference in New Issue